This notebook includes 4 strategies used for data cleaning as follows:

In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LassoCV
import re

from IPython.display import display

#import vaex

Changed working directory path to where my data is saved.

In [5]:
import os
print(os.getcwd())
os.chdir("/Users/sitebai/Desktop/UCI/fall/bana212da/listing_zips")
/Users/sitebai/Desktop/UCI/fall/bana212da/data_prep
In [3]:
# 
pd.options.display.max_columns = None
pd.options.display.max_rows = None
In [278]:
months = ['jan','feb','mar','apr','may','jun','july','aug','sep','oct','nov1','nov15','nov19','dec']
la = 'la_'
names = {}
for i in months: 
    names[la +i] = pd.read_csv("listings_" + i +".csv")
   


    
/opt/anaconda3/lib/python3.7/site-packages/IPython/core/interactiveshell.py:3058: DtypeWarning: Columns (43,61,62) have mixed types.Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)
/opt/anaconda3/lib/python3.7/site-packages/IPython/core/interactiveshell.py:3058: DtypeWarning: Columns (61,62) have mixed types.Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)
In [ ]:
spring_listing

Given that the objective of this project is predicting listing price using various machin learning models instead of predicting booked rate, I put calendar dataset aside for now and may come back to study it later.

2020 summer

In [385]:
cal = 'cal_'
current = {}
for i in summer:
    current[cal+i] = pd.read_csv("calendar_2020_" + i +".csv")
In [387]:
weekDays = ("Monday","Tuesday","Wednesday","Thursday","Friday","Saturday","Sunday")
In [388]:
for i in current:
        current[i]['date'] = pd.to_datetime(current[i]['date'])
        current[i]['weekday'] = current[i]['date'].apply(lambda x : x.weekday())
        current[i]['weekdays'] = current[i]['weekday'].apply(lambda x : weekDays[x])
In [ ]:
 
In [390]:
current_keys = list(current.keys())
current_summer = current['cal_july']
for i in current_keys:
    current_summer= pd.concat([current_summer,current[i]], axis = 0)
In [391]:
current_summer.shape
Out[391]:
(49058842, 9)
In [460]:
current_summer.shape
Out[460]:
(49058842, 9)
In [394]:
current_summer.date = current_summer.date.dt.month
In [395]:
current_summer.date.value_counts()
Out[395]:
8     4167101
7     4166665
12    4166555
10    4166555
5     4166555
3     4166555
1     4166555
9     4032511
11    4032150
6     4032150
4     4032150
2     3763340
Name: date, dtype: int64
In [396]:
current_df = current_summer[(current_summer['date'] == 7) | (current_summer['date'] == 8) | (current_summer['date'] == 9)]
In [397]:
current_df.shape
Out[397]:
(12366277, 9)
In [398]:
current_df.date.value_counts()
Out[398]:
8    4167101
7    4166665
9    4032511
Name: date, dtype: int64
In [466]:
current_df.isnull().sum()
Out[466]:
listing_id           0
date                 0
available            0
price              768
adjusted_price     768
minimum_nights    1108
maximum_nights    1108
weekday              0
weekdays             0
dtype: int64
In [399]:
current_df = current_df.dropna()
In [400]:
current_df.shape
Out[400]:
(12364401, 9)
In [ ]:
 
In [401]:
current_df['price'] = current_df['price'].astype(str).str.replace("$","").str.replace(",","").astype(float)
current_df['adjusted_price'] = current_df['adjusted_price'].astype(str).str.replace("$","").str.replace(",","").astype(float)
In [402]:
current_df['available'] = current_df['available'].map(dict(f=1,t=0))
In [404]:
current_df.head()
Out[404]:
listing_id date available price adjusted_price minimum_nights maximum_nights weekday weekdays
0 43849914 7 1 54.0 54.0 1.0 1125.0 4 Friday
1 43849914 7 1 54.0 54.0 1.0 1125.0 5 Saturday
2 43849914 7 1 54.0 54.0 1.0 1125.0 6 Sunday
3 43849914 7 1 54.0 54.0 1.0 1125.0 0 Monday
4 43849914 7 1 54.0 54.0 1.0 1125.0 1 Tuesday
In [403]:
current_df.shape
Out[403]:
(12364401, 9)
In [405]:
print(os.getcwd())
/Users/sitebai/Desktop/UCI/fall/bana212da/listing_zips
In [406]:
current_df.to_csv('/Users/sitebai/Desktop/UCI/fall/bana212da/listing_zips/summer_2020.csv')
In [ ]:
 
In [421]:
current_booked = current_df.groupby('date').agg({'available':sum})
In [415]:
current_listings = current_df.groupby('date')['price'].count().to_frame()
In [417]:
current_listings = current_listings.reset_index()
In [419]:
current_listings
Out[419]:
date price
0 7 4166206
1 8 4166246
2 9 4031949
In [423]:
current_booked = current_booked.reset_index()
In [424]:
current_booked
Out[424]:
date available
0 7 2541563
1 8 2216550
2 9 1875621
In [425]:
current_booked['booked_rate'] =  round(current_booked['available'] / current_listings['price'],3)
In [429]:
current_booked_avg_price = current_df.groupby('date').agg({'price': mean})
In [431]:
current_booked_avg_price.reset_index(inplace = True)
In [434]:
current_booked_avg_price.price = round(current_booked_avg_price.price,2)
In [435]:
current_booked['avg_price'] = current_booked_avg_price.price
In [439]:
current_booked.rename(columns = {'available':'booked'},inplace= True)
In [441]:
print(os.getcwd())
/Users/sitebai/Desktop/UCI/fall/bana212da/listing_zips
In [443]:
current_booked.to_csv('/Users/sitebai/Desktop/UCI/fall/bana212da/listing_zips/summer_2020_booked_rate.csv')
In [479]:
current_booked
Out[479]:
date booked booked_rate avg_price
0 7 2541563 0.610 237.76
1 8 2216550 0.532 235.07
2 9 1875621 0.465 227.19
In [492]:
current_weekdays_booked = current_df.groupby('weekdays').agg({'available':sum})
In [493]:
current_weekdays_listings = current_df.groupby('weekdays')['price'].count().to_frame()
In [494]:
current_weekdays_booked['booked_rate']  =round(current_weekdays_booked['available'] / current_weekdays_listings['price'],3)

This is 2020 summer weekdays booked rate

In [496]:
current_weekdays_booked.reset_index(inplace = True)
In [497]:
current_weekdays_booked
Out[497]:
weekdays available booked_rate
0 Friday 986386 0.551
1 Monday 922952 0.528
2 Saturday 974043 0.548
3 Sunday 948759 0.538
4 Thursday 966206 0.543
5 Tuesday 919160 0.524
6 Wednesday 916228 0.523
In [499]:
summer_week_booked['booked_rate_2020'] = current_weekdays_booked['booked_rate']
In [500]:
summer_week_booked
Out[500]:
weekdays num_booked booked_rate booked_rate_2020
0 Friday 1386340 0.594 0.551
1 Monday 1316019 0.559 0.528
2 Saturday 1417987 0.597 0.548
3 Sunday 1331527 0.571 0.538
4 Thursday 1389901 0.585 0.543
5 Tuesday 1264599 0.570 0.524
6 Wednesday 1336392 0.573 0.523
In [502]:
summer_week_booked.to_csv('/Users/sitebai/Desktop/UCI/fall/bana212da/listing_zips/summer_week_2019_2020.csv')
In [501]:
print(os.getcwd())
/Users/sitebai/Desktop/UCI/fall/bana212da/listing_zips
In [493]:
current_weekdays_booked.sort_values('booked_rate',ascending  =False)a
Out[493]:
available booked_rate
weekdays
Friday 986386 0.551
Saturday 974043 0.548
Thursday 966206 0.543
Sunday 948759 0.538
Monday 922952 0.528
Tuesday 919160 0.524
Wednesday 916228 0.523
In [494]:
summer_week_booked
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-494-b0c2691fefd2> in <module>
----> 1 summer_week_booked

NameError: name 'summer_week_booked' is not defined
In [ ]:
 
In [ ]:
 

only half a year

In [179]:
half_year= ['aug','sep','oct','nov1','nov15','nov19','dec','jan']
In [384]:
summer = ['july','aug','sep']
In [5]:
spring = ['jan','feb','mar']
In [333]:
fall = ['apr','may','jun']
In [335]:
calfall = 'fall_'
fall_month = {}
for i in fall:
    fall_month[calfall +i] = pd.read_csv("calendar_"+i+".csv")
In [338]:
for i in fall_month:
        fall_month[i]['date'] = pd.to_datetime(fall_month[i]['date'])
        fall_month[i]['weekday'] = fall_month[i]['date'].apply(lambda x : x.weekday())
        fall_month[i]['weekdays'] = fall_month[i]['weekday'].apply(lambda x : weekDays[x])

Concatenate three fall tables togethe to one data frame named fall_df

In [340]:
fall_month.keys()
Out[340]:
dict_keys(['fall_apr', 'fall_may', 'fall_jun'])
In [342]:
fall_keys = list(fall_month.keys())
fall_df = fall_month['fall_apr']
for i in fall_keys:
    fall_df = pd.concat([fall_df,fall_month[i]], axis = 0)
In [344]:
fall_df.shape
Out[344]:
(64009185, 9)
In [345]:
fall_df = fall_df.dropna()
In [346]:
fall_df.date = fall_df.date.dt.month
In [348]:
fall_df = fall_df[fall_df['date'].isin([4,5,6])]
In [350]:
fall_df.date.value_counts()
Out[350]:
5    5392149
6    5216182
4    5173904
Name: date, dtype: int64
In [351]:
fall_df.price = fall_df.price.astype(str).str.replace("$","").str.replace(",","").astype(float)
fall_df.adjusted_price = fall_df.adjusted_price.astype(str).str.replace("$","").str.replace(",","").astype(float)
In [352]:
fall_df.available = fall_df.available.map(dict(f=1,t=0))
In [353]:
fall_df.head()
Out[353]:
listing_id date available price adjusted_price minimum_nights maximum_nights weekday weekdays
0 9140 4 1 110.0 110.0 2.0 1125.0 5 Saturday
1 9140 4 0 80.0 80.0 2.0 1125.0 6 Sunday
2 9140 4 0 80.0 80.0 2.0 1125.0 0 Monday
3 9140 4 0 80.0 80.0 2.0 1125.0 1 Tuesday
4 9140 4 0 80.0 80.0 2.0 1125.0 2 Wednesday
In [367]:
fall_cleaned = fall_df
In [368]:
fall_cleaned.to_csv(r'/Users/sitebai/Desktop/UCI/fall/bana212da/listing_zips/fall_cleaned.csv', index = False)
In [356]:
fall_booked = fall_df.groupby('date').agg({'available':sum})
In [357]:
fall_listings = fall_df.groupby('date')['price'].count().to_frame()
In [358]:
fall_booked['booked_rate'] = round(fall_booked['available'] / fall_listings['price'],3)
In [359]:
fall_booked
Out[359]:
available booked_rate
date
4 3196123 0.618
5 2902812 0.538
6 2575930 0.494

Concatenate 12 months of calendar dataset together as one dateframe named calendar_2019

In [423]:
winter = pd.read_csv('winter_cleaned_updated.csv')
In [429]:
spring = pd.read_csv('spring_cleaned.csv')
In [431]:
summer = pd.read_csv('summer_cleaned.csv')
In [426]:
fall = pd.read_csv('fall_cleaned.csv')
In [362]:
winter.head()
Out[362]:
listing_id date available price adjusted_price minimum_nights maximum_nights weekday weekdays
0 22214 10 1 60.0 60.0 7.0 730.0 1 Tuesday
1 71514 10 1 70.0 70.0 7.0 365.0 1 Tuesday
2 71514 10 1 70.0 70.0 7.0 365.0 2 Wednesday
3 71514 10 1 70.0 70.0 7.0 365.0 3 Thursday
4 71514 10 1 70.0 70.0 7.0 365.0 4 Friday
In [425]:
fall = fall_cleaned
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-425-9f5b65389c1c> in <module>
----> 1 fall = fall_cleaned

NameError: name 'fall_cleaned' is not defined
In [365]:
spring.head()
Out[365]:
listing_id date available price adjusted_price minimum_nights maximum_nights weekday weekdays
0 2732 1 0 140.0 140.0 1.0 180.0 5 Saturday
1 2732 1 0 140.0 140.0 1.0 180.0 6 Sunday
2 2732 1 0 140.0 140.0 1.0 180.0 0 Monday
3 2732 1 0 140.0 140.0 1.0 180.0 1 Tuesday
4 2732 1 0 140.0 140.0 1.0 180.0 2 Wednesday
In [444]:
summer.head()
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-444-f7b4eb2e9332> in <module>
----> 1 summer.head()

AttributeError: 'list' object has no attribute 'head'
In [432]:
summer_spring = spring.append(summer)
In [433]:
summer_spring.date.value_counts()
Out[433]:
8    5513464
7    5469156
1    5385343
3    5341667
9    5333698
2    4907128
Name: date, dtype: int64
In [427]:
winter_fall = fall.append(winter)
In [398]:
winter_fall.shape
Out[398]:
(35315964, 9)
In [434]:
winter_fall.date.value_counts()
Out[434]:
5     5392149
12    5232372
6     5216182
10    5182414
4     5173904
11    5060197
Name: date, dtype: int64
In [435]:
calendar_2019  = summer_spring.append(winter_fall)
In [436]:
calendar_2019.shape
Out[436]:
(63207674, 9)
In [437]:
calendar_2019.date.value_counts()
Out[437]:
8     5513464
7     5469156
5     5392149
1     5385343
3     5341667
9     5333698
12    5232372
6     5216182
10    5182414
4     5173904
11    5060197
2     4907128
Name: date, dtype: int64
In [413]:
print(os.getcwd())
/Users/sitebai/Desktop/UCI/fall/bana212da/listing_zips
In [438]:
calendar_2019.to_csv("/Users/sitebai/Desktop/UCI/fall/bana212da/listing_zips/calendar_2019_updated.csv")

check the price seasonality by using OLS to see their coefficients.

In [439]:
calendar_2019.weekdays.dtype
Out[439]:
dtype('O')
In [440]:
calendar_2019.date = calendar_2019.date.astype('category')
In [441]:
calendar_2019.weekdays = calendar_2019.weekdays.astype('category')
In [435]:
calendar_2019.price.dtype
Out[435]:
dtype('float64')
In [442]:
calendar_booked = calendar_2019.groupby('date').agg({'available':sum})
In [443]:
calendar_listings = calendar_2019.groupby('date')['price'].count().to_frame()
calendar_price = calendar_2019.groupby('date').agg({'price':mean()})
In [464]:
calendar_listings = calendar_listings.reset_index()\
In [465]:
calendar_listings
Out[465]:
date price
0 1 5385343
1 2 4907128
2 3 5341667
3 4 5173904
4 5 5392149
5 6 5216182
6 7 5469071
7 8 5513237
8 9 5333407
9 10 5182414
10 11 5060197
11 12 5232372
In [450]:
calendar_price = calendar_2019.groupby('date').agg({'price':mean})
In [452]:
calendar_price['price'] = round(calendar_price['price'],2)
In [457]:
calendar_price = calendar_price.reset_index()
In [444]:
calendar_booked['booked_rate'] = round(calendar_booked.available / calendar_listings.price,3)
In [445]:
calendar_booked_df = calendar_booked.reset_index()
In [459]:
calendar_booked_df['avg_price'] = calendar_price['price']
In [466]:
calendar_booked_df['listings'] =  calendar_listings['price']
In [467]:
calendar_booked_df
Out[467]:
date available booked_rate avg_price listings
0 1 3196969 0.594 202.19 5385343
1 2 2549200 0.519 197.50 4907128
2 3 2327712 0.436 195.48 5341667
3 4 3196123 0.618 207.32 5173904
4 5 2902812 0.538 205.93 5392149
5 6 2575930 0.494 209.84 5216182
6 7 3612729 0.661 224.11 5469071
7 8 3274042 0.594 223.15 5513237
8 9 2555994 0.479 216.20 5333407
9 10 3258788 0.629 219.33 5182414
10 11 2881728 0.569 217.86 5060197
11 12 2588505 0.495 219.95 5232372
In [447]:
print(os.getcwd())
/Users/sitebai/Desktop/UCI/fall/bana212da/listing_zips
In [468]:
calendar_booked_df.to_csv(r'/Users/sitebai/Desktop/UCI/fall/bana212da/listing_zips/calendar_booked_df_with_listings.csv', index = False)
In [488]:
#calendar_booked_df.to_csv(r'/Users/sitebai/Desktop/UCI/fall/bana212da/listing_zips/calendar_booked_df.csv', index = False)
In [273]:
calendar_2019_booked_rate = calendar_booked_df.sort_values('booked_rate',ascending = False)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-273-4901a50b3306> in <module>
----> 1 calendar_2019_booked_rate = calendar_booked.sort_values('booked_rate',ascending = False)

NameError: name 'calendar_booked' is not defined
In [ ]:
 
In [7]:
import os
In [9]:
print(os.getcwd())
/Users/sitebai/Desktop/UCI/fall/bana212da/data_prep
In [12]:
calendar_booked_df = pd.read_csv('/Users/sitebai/Desktop/UCI/fall/bana212da/listing_zips/calendar_booked_df.csv')
calendar_2019 = pd.read_csv('/Users/sitebai/Desktop/UCI/fall/bana212da/listing_zips/calendar_2019.csv')
In [104]:
calendar_2019 = pd.read_csv('/Users/sitebai/Desktop/UCI/fall/bana212da/listing_zips/calendar_2019.csv')
In [296]:
.yearly_booked = calendar_2019.groupby(['date','weekdays']).agg({'available':sum})
In [387]:
calendar_2019.groupby('date')['price'].count().to_frame()
Out[387]:
price
date
1 5385343
2 4907128
3 5341667
4 5173904
5 5392149
6 5216182
7 5469037
8 5513189
9 5333407
10 1665791
11 1665780
12 1721301
In [297]:
yearly_listings = calendar_2019.groupby(['date','weekdays'])['price'].count().to_frame()
In [298]:
yearly_booked['booked_rate'] = round(yearly_booked['available'] / yearly_listings['price'],3)
In [481]:
yearly_booked_df= yearly_booked.reset_index()
In [299]:
yearly_booked_rate = yearly_booked.reset_index()
In [300]:
yearly_booked_rate
Out[300]:
date weekdays available booked_rate
0 1 Friday 496314 0.612
1 1 Monday 404611 0.582
2 1 Saturday 463665 0.616
3 1 Sunday 414195 0.596
4 1 Thursday 509606 0.587
... ... ... ... ...
79 12 Saturday 100756 0.454
80 12 Sunday 124817 0.450
81 12 Thursday 98266 0.442
82 12 Tuesday 122953 0.443
83 12 Wednesday 97192 0.438

84 rows × 4 columns

In [301]:
yearly_booked_rate.to_csv(r'/Users/sitebai/Desktop/UCI/fall/bana212da/listing_zips/yearly_booked_rate.csv', index = False)
In [484]:
yearly_booked_rate
Out[484]:
date weekdays available booked_rate
0 1 Friday 496314 0.612
1 1 Monday 404611 0.582
2 1 Saturday 463665 0.616
3 1 Sunday 414195 0.596
4 1 Thursday 509606 0.587
5 1 Tuesday 401555 0.578
6 1 Wednesday 507023 0.584
7 2 Friday 348910 0.536
8 2 Monday 364075 0.512
9 2 Saturday 400815 0.543
10 2 Sunday 383861 0.531
11 2 Thursday 353913 0.509
12 2 Tuesday 348740 0.502
13 2 Wednesday 348886 0.502
14 3 Friday 364274 0.441
15 3 Monday 299525 0.431
16 3 Saturday 363657 0.441
17 3 Sunday 371203 0.427
18 3 Thursday 326682 0.448
19 3 Tuesday 296822 0.427
20 3 Wednesday 305549 0.434
21 4 Friday 454315 0.635
22 4 Monday 420578 0.600
23 4 Saturday 499696 0.646
24 4 Sunday 431593 0.615
25 4 Thursday 489266 0.619
26 4 Tuesday 415979 0.593
27 4 Wednesday 484696 0.614
28 5 Friday 486169 0.555
29 5 Monday 382659 0.528
30 5 Saturday 426479 0.572
31 5 Sunday 426162 0.556
32 5 Thursday 416779 0.529
33 5 Tuesday 360728 0.514
34 5 Wednesday 403836 0.512
35 6 Friday 356860 0.509
36 6 Monday 340428 0.485
37 6 Saturday 417925 0.502
38 6 Sunday 397469 0.478
39 6 Thursday 350617 0.500
40 6 Tuesday 347331 0.488
41 6 Wednesday 365300 0.496
42 7 Friday 540975 0.670
43 7 Monday 482775 0.653
44 7 Saturday 483707 0.674
45 7 Sunday 469670 0.655
46 7 Thursday 534423 0.662
47 7 Tuesday 515303 0.657
48 7 Wednesday 585876 0.654
49 8 Friday 494013 0.611
50 8 Monday 443840 0.582
51 8 Saturday 545179 0.608
52 8 Sunday 455817 0.598
53 8 Thursday 511756 0.602
54 8 Tuesday 411177 0.573
55 8 Wednesday 412260 0.575
56 9 Friday 351352 0.490
57 9 Monday 389404 0.457
58 9 Saturday 389101 0.511
59 9 Sunday 406040 0.477
60 9 Thursday 343722 0.479
61 9 Tuesday 338119 0.471
62 9 Wednesday 338256 0.472
63 10 Friday 629327 0.639
64 10 Monday 537771 0.621
65 10 Saturday 625319 0.639
66 10 Sunday 537454 0.626
67 10 Thursday 675406 0.629
68 10 Tuesday 592686 0.630
69 10 Wednesday 537531 0.626
70 11 Friday 597605 0.602
71 11 Monday 482806 0.562
72 11 Saturday 596410 0.601
73 11 Sunday 505434 0.589
74 11 Thursday 494190 0.576
75 11 Tuesday 478315 0.557
76 11 Wednesday 514546 0.572
77 12 Friday 412261 0.480
78 12 Monday 499505 0.465
79 12 Saturday 416905 0.486
80 12 Sunday 511663 0.477
81 12 Thursday 401109 0.467
82 12 Tuesday 498611 0.465
83 12 Wednesday 393833 0.459
In [477]:
month_1 = yearly_booked_rate[yearly_booked_rate['date'] ==1].sort_values('booked_rate',ascending = False)
In [478]:
month_1
Out[478]:
date weekdays available booked_rate
2 1 Saturday 463665 0.616
0 1 Friday 496314 0.612
3 1 Sunday 414195 0.596
4 1 Thursday 509606 0.587
6 1 Wednesday 507023 0.584
1 1 Monday 404611 0.582
5 1 Tuesday 401555 0.578
In [479]:
sns.lineplot(data=month_1, x="weekdays", y="booked_rate")
Out[479]:
<matplotlib.axes._subplots.AxesSubplot at 0x7feb2ef07650>
In [ ]:
 
In [307]:
calendar_2019.head()
Out[307]:
listing_id date available price adjusted_price minimum_nights maximum_nights weekday weekdays
0 2732 1 0 140.0 140.0 1.0 180.0 5 Saturday
1 2732 1 0 140.0 140.0 1.0 180.0 6 Sunday
2 2732 1 0 140.0 140.0 1.0 180.0 0 Monday
3 2732 1 0 140.0 140.0 1.0 180.0 1 Tuesday
4 2732 1 0 140.0 140.0 1.0 180.0 2 Wednesday
In [469]:
calendar_weekdays_booked = calendar_2019.groupby('weekdays').agg({'available':sum})
calendar_weekdays_listings = calendar_2019.groupby('weekdays')['price'].count().to_frame()
calendar_weekdays_booked['booked_rate'] = round(calendar_weekdays_booked.available / calendar_weekdays_listings.price,3)
In [483]:
calendar_weekdays_avgprice = calendar_2019.groupby('weekdays').agg({'price':mean})
In [484]:
calendar_weekdays_avgprice = calendar_weekdays_avgprice.reset_index()
In [486]:
calendar_weekdays_avgprice['price'] =  round(calendar_weekdays_avgprice['price'] ,2)
In [470]:
calendar_weekdays_total= calendar_weekdays_listings.reset_index()
In [474]:
calendar_weekdays_booked = calendar_weekdays_booked.reset_index()
In [479]:
calendar_weekdays_listings = calendar_weekdays_listings.reset_index()
In [480]:
calendar_weekdays_listings
Out[480]:
weekdays price
0 Friday 9154113
1 Monday 8798426
2 Saturday 9266750
3 Sunday 9127919
4 Thursday 9156360
5 Tuesday 8709982
6 Wednesday 8993521
In [481]:
calendar_weekdays_booked['listings'] = calendar_weekdays_listings['price']
In [487]:
calendar_weekdays_booked['avg_price'] = calendar_weekdays_avgprice['price']
In [488]:
calendar_weekdays_booked
Out[488]:
weekdays available booked_rate listings avg_price
0 Friday 5192768 0.567 9154113 217.53
1 Monday 4744961 0.539 8798426 208.69
2 Saturday 5278786 0.570 9266750 218.10
3 Sunday 4994889 0.547 9127919 209.67
4 Thursday 5090136 0.556 9156360 209.79
5 Tuesday 4705134 0.540 8709982 208.78
6 Wednesday 4913858 0.546 8993521 208.70
In [311]:
print(os.getcwd())
/Users/sitebai/Desktop/UCI/fall/bana212da/data_prep
In [489]:
calendar_weekdays_booked.to_csv(r'/Users/sitebai/Desktop/UCI/fall/bana212da/listing_zips/calendar_weekdays_booked_final_with_avg_price.csv', index = False)
In [105]:
calendar_2019.dropna(inplace = True)
In [491]:
month_index = calendar_2019["date"].astype('category')
In [492]:
month_index.head()
Out[492]:
0    1
1    1
2    1
3    1
4    1
Name: date, dtype: category
Categories (12, int64): [1, 2, 3, 4, ..., 9, 10, 11, 12]

Run ols on months and weekdays to check seasonality

In [2]:
import statsmodels.api as stat
/opt/anaconda3/lib/python3.7/site-packages/statsmodels/tools/_testing.py:19: FutureWarning: pandas.util.testing is deprecated. Use the functions in the public API at pandas.testing instead.
  import pandas.util.testing as tm
In [493]:
month_dummies_new =pd.get_dummies(month_index)
In [495]:
month_dummies_new.head()
Out[495]:
1 2 3 4 5 6 7 8 9 10 11 12
0 1 0 0 0 0 0 0 0 0 0 0 0
1 1 0 0 0 0 0 0 0 0 0 0 0
2 1 0 0 0 0 0 0 0 0 0 0 0
3 1 0 0 0 0 0 0 0 0 0 0 0
4 1 0 0 0 0 0 0 0 0 0 0 0
In [490]:
month_dummies.head()
Out[490]:
1 2 3 4 5 6 7 8 9 10 11 12
0 1 0 0 0 0 0 0 0 0 0 0 0
1 1 0 0 0 0 0 0 0 0 0 0 0
2 1 0 0 0 0 0 0 0 0 0 0 0
3 1 0 0 0 0 0 0 0 0 0 0 0
4 1 0 0 0 0 0 0 0 0 0 0 0
In [126]:
month_dummies.head()
Out[126]:
1 2 3 4 5 6 7 8 9 10 11 12
0 1 0 0 0 0 0 0 0 0 0 0 0
1 1 0 0 0 0 0 0 0 0 0 0 0
2 1 0 0 0 0 0 0 0 0 0 0 0
3 1 0 0 0 0 0 0 0 0 0 0 0
4 1 0 0 0 0 0 0 0 0 0 0 0
In [91]:
calendar_months_dummies= calendar_month_dummies.drop(calendar_month_dummies.columns[[0]], axis = 1)
In [92]:
calendar_months_dummies.head()
Out[92]:
2 3 4 5 6 7 8 9 10 11 12
0 0 0 0 0 0 0 0 0 0 0 0
1 0 0 0 0 0 0 0 0 0 0 0
2 0 0 0 0 0 0 0 0 0 0 0
3 0 0 0 0 0 0 0 0 0 0 0
4 0 0 0 0 0 0 0 0 0 0 0
In [6]:
calendar_month_y = calendar_2019["price"]
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-6-3bdca46b50b0> in <module>
----> 1 calendar_month_y = calendar_2019["price"]

NameError: name 'calendar_2019' is not defined
In [499]:
calendar_month_y.shape
Out[499]:
(63207674,)
In [500]:
month_dummies_new.shape
Out[500]:
(63207674, 12)
In [84]:
if isinstance(calendar_2019, pd.DataFrame):
    print('yes')
    
yes
In [ ]:
month_dummies.head()
In [128]:
month_ols= stat.OLS(calendar_month_y, month_dummies)
In [ ]:
 
In [3]:
month_new_ols = stat.OLS(calendar_month_y, month_dummies_new)
month_new_ols_result = month_new_ols.fit()
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-3-0a4576cc57fc> in <module>
----> 1 month_new_ols = stat.OLS(calendar_month_y, month_dummies_new)
      2 month_new_ols_result = month_new_ols.fit()

NameError: name 'calendar_month_y' is not defined
In [ ]:
print(month_new_ols_result.summary())
In [129]:
month_ols_result = month_ols.fit()
In [130]:
print(month_ols_result.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                  price   R-squared:                       0.000
Model:                            OLS   Adj. R-squared:                  0.000
Method:                 Least Squares   F-statistic:                     1415.
Date:                Thu, 03 Dec 2020   Prob (F-statistic):               0.00
Time:                        11:37:44   Log-Likelihood:            -4.0732e+08
No. Observations:            52784878   AIC:                         8.146e+08
Df Residuals:                52784866   BIC:                         8.146e+08
Df Model:                          11                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
1            202.1873      0.234    863.641      0.000     201.728     202.646
2            197.5009      0.245    805.296      0.000     197.020     197.982
3            195.4801      0.235    831.599      0.000     195.019     195.941
4            207.3177      0.239    867.997      0.000     206.850     207.786
5            205.9274      0.234    880.173      0.000     205.469     206.386
6            209.8406      0.238    882.143      0.000     209.374     210.307
7            224.1127      0.232    964.706      0.000     223.657     224.568
8            223.1473      0.231    964.420      0.000     222.694     223.601
9            216.2037      0.235    919.049      0.000     215.743     216.665
10           209.2649      0.421    497.141      0.000     208.440     210.090
11           208.7173      0.421    495.838      0.000     207.892     209.542
12           213.0776      0.414    514.564      0.000     212.266     213.889
==============================================================================
Omnibus:                105062232.199   Durbin-Watson:                   0.059
Prob(Omnibus):                  0.000   Jarque-Bera (JB):     395302243367.115
Skew:                          16.176   Prob(JB):                         0.00
Kurtosis:                     425.715   Cond. No.                         1.82
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
In [133]:
calendar_2019.dtypes
Out[133]:
listing_id          int64
date                int64
available           int64
price             float64
adjusted_price    float64
minimum_nights    float64
maximum_nights    float64
weekday             int64
weekdays           object
dtype: object
In [134]:
booked_month = calendar_2019["available"].astype('category')
In [135]:
booked = calendar_2019["available"]
In [137]:
seasonality_booked_model= stat.OLS(booked, month_dummies)
seasonality_booked_result  = seasonality_booked_model.fit()
In [138]:
print(seasonality_booked_result.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:              available   R-squared:                       0.020
Model:                            OLS   Adj. R-squared:                  0.020
Method:                 Least Squares   F-statistic:                 9.760e+04
Date:                Thu, 03 Dec 2020   Prob (F-statistic):               0.00
Time:                        12:03:15   Log-Likelihood:            -3.7541e+07
No. Observations:            52784878   AIC:                         7.508e+07
Df Residuals:                52784866   BIC:                         7.508e+07
Df Model:                          11                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
1              0.5936      0.000   2795.711      0.000       0.593       0.594
2              0.5195      0.000   2335.344      0.000       0.519       0.520
3              0.4358      0.000   2043.861      0.000       0.435       0.436
4              0.6177      0.000   2851.510      0.000       0.617       0.618
5              0.5383      0.000   2536.872      0.000       0.538       0.539
6              0.4938      0.000   2288.855      0.000       0.493       0.494
7              0.6606      0.000   3134.956      0.000       0.660       0.661
8              0.5938      0.000   2829.517      0.000       0.593       0.594
9              0.4792      0.000   2245.884      0.000       0.479       0.480
10             0.6342      0.000   1661.032      0.000       0.633       0.635
11             0.5276      0.000   1381.868      0.000       0.527       0.528
12             0.4455      0.000   1186.201      0.000       0.445       0.446
==============================================================================
Omnibus:                193317225.867   Durbin-Watson:                   0.202
Prob(Omnibus):                  0.000   Jarque-Bera (JB):          8113760.672
Skew:                          -0.184   Prob(JB):                         0.00
Kurtosis:                       1.115   Cond. No.                         1.82
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
In [139]:
weekdays_booked = stat.OLS(booked, weekdays_dummy)
weekdays_booked_result = weekdays_booked.fit()
In [140]:
print(weekdays_booked_result.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:              available   R-squared:                       0.001
Model:                            OLS   Adj. R-squared:                  0.001
Method:                 Least Squares   F-statistic:                     4882.
Date:                Thu, 03 Dec 2020   Prob (F-statistic):               0.00
Time:                        12:06:32   Log-Likelihood:            -3.8058e+07
No. Observations:            52784878   AIC:                         7.612e+07
Df Residuals:                52784871   BIC:                         7.612e+07
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Friday         0.5621      0.000   3121.613      0.000       0.562       0.562
Monday         0.5340      0.000   2900.646      0.000       0.534       0.534
Saturday       0.5652      0.000   3165.197      0.000       0.565       0.566
Sunday         0.5417      0.000   3008.962      0.000       0.541       0.542
Thursday       0.5511      0.000   3066.801      0.000       0.551       0.551
Tuesday        0.5346      0.000   2880.818      0.000       0.534       0.535
Wednesday      0.5412      0.000   2994.073      0.000       0.541       0.542
==============================================================================
Omnibus:                184062056.465   Durbin-Watson:                   0.199
Prob(Omnibus):                  0.000   Jarque-Bera (JB):          8781040.020
Skew:                          -0.190   Prob(JB):                         0.00
Kurtosis:                       1.038   Cond. No.                         1.04
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
In [ ]:
print(seasonality_booked_result.summary())
In [429]:
calendar_weekdays_dummies  = pd.get_dummies(calendar_2019['weekdays'])
In [119]:
calendar_weekdays = calendar_2019['weekdays'].astype('category')
In [120]:
weekdays_dummy = pd.get_dummies(calendar_weekdays)
In [112]:
calendar_2019.columns
Out[112]:
Index(['listing_id', 'date', 'available', 'price', 'adjusted_price',
       'minimum_nights', 'maximum_nights', 'weekday', 'weekdays'],
      dtype='object')
In [121]:
weekdays_ols = stat.OLS(calendar_weekdays_y, weekdays_dummy)
weekdays_result = weekdays_ols.fit()
In [122]:
print(weekdays_result.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                  price   R-squared:                       0.000
Model:                            OLS   Adj. R-squared:                  0.000
Method:                 Least Squares   F-statistic:                     501.3
Date:                Thu, 03 Dec 2020   Prob (F-statistic):               0.00
Time:                        09:01:28   Log-Likelihood:            -4.0732e+08
No. Observations:            52784878   AIC:                         8.146e+08
Df Residuals:                52784871   BIC:                         8.146e+08
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Friday       215.4033      0.197   1095.471      0.000     215.018     215.789
Monday       206.2049      0.201   1025.794      0.000     205.811     206.599
Saturday     216.0133      0.195   1107.910      0.000     215.631     216.395
Sunday       207.3212      0.197   1054.606      0.000     206.936     207.706
Thursday     207.5816      0.196   1057.891      0.000     207.197     207.966
Tuesday      206.0460      0.203   1016.949      0.000     205.649     206.443
Wednesday    206.6038      0.197   1046.764      0.000     206.217     206.991
==============================================================================
Omnibus:                105064581.384   Durbin-Watson:                   0.059
Prob(Omnibus):                  0.000   Jarque-Bera (JB):     395278389801.514
Skew:                          16.177   Prob(JB):                         0.00
Kurtosis:                     425.702   Cond. No.                         1.04
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
In [113]:
weekdays_index = calendar_2019['weekday'].astype('category')
In [115]:
weekdays_dummies = pd.get_dummies(weekdays_index)
In [ ]:
 
In [116]:
calendar_weekdays_y = calendar_2019['price']
In [117]:
calendar_weekdays_ols = stat.OLS(calendar_weekdays_y, weekdays_dummies)
calendar_weekdays_result = calendar_weekdays_ols.fit()
In [118]:
print(calendar_weekdays_result.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                  price   R-squared:                       0.000
Model:                            OLS   Adj. R-squared:                  0.000
Method:                 Least Squares   F-statistic:                     501.3
Date:                Thu, 03 Dec 2020   Prob (F-statistic):               0.00
Time:                        08:56:24   Log-Likelihood:            -4.0732e+08
No. Observations:            52784878   AIC:                         8.146e+08
Df Residuals:                52784871   BIC:                         8.146e+08
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
0            206.2049      0.201   1025.794      0.000     205.811     206.599
1            206.0460      0.203   1016.949      0.000     205.649     206.443
2            206.6038      0.197   1046.764      0.000     206.217     206.991
3            207.5816      0.196   1057.891      0.000     207.197     207.966
4            215.4033      0.197   1095.471      0.000     215.018     215.789
5            216.0133      0.195   1107.910      0.000     215.631     216.395
6            207.3212      0.197   1054.606      0.000     206.936     207.706
==============================================================================
Omnibus:                105064581.384   Durbin-Watson:                   0.059
Prob(Omnibus):                  0.000   Jarque-Bera (JB):     395278389801.513
Skew:                          16.177   Prob(JB):                         0.00
Kurtosis:                     425.702   Cond. No.                         1.04
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

Check the booked rate seasonality

In [ ]:
 
In [ ]:
 
In [ ]:
calendar = pd.concat()
In [8]:
cal = 'cal_'
calendars = {}
for i in half_year:
    calendars[cal+i] = pd.read_csv("calendar_" + i +".csv")
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-8-bdb4c1344bc8> in <module>
      1 cal = 'cal_'
      2 calendars = {}
----> 3 for i in half_year:
      4     calendars[cal+i] = pd.read_csv("calendar_" + i +".csv")

NameError: name 'half_year' is not defined

Study spring price pattern and booked rate trend .

In [297]:
calspring  = 'spring_'
spring_month = {}
for i in spring:
    spring_month[calspring +i] = pd.read_csv("calendar_"+i+".csv")
    
In [298]:
 for i in spring_month:
        spring_month[i]['date'] = pd.to_datetime(spring_month[i]['date'])
        spring_month[i]['weekday'] = spring_month[i]['date'].apply(lambda x : x.weekday())
        spring_month[i]['weekdays'] = spring_month[i]['weekday'].apply(lambda x : weekDays[x])

Concatenate three spring tables togethe to one data frame named spring_df

In [299]:
spring_keys = list(spring_month.keys())
spring_df = spring_month['spring_jan']
for i in spring_keys:
    spring_df = pd.concat([spring_df,spring_month[i]], axis = 0)

Dropping na values.

In [225]:
spring_df.shape
Out[225]:
(63402039, 9)
In [226]:
spring_df.isnull().sum()
Out[226]:
listing_id          0
date                0
available           0
price             356
adjusted_price    356
minimum_nights     20
maximum_nights     20
weekday             0
weekdays            0
dtype: int64
In [300]:
spring_nona = spring_df.dropna()

Converting datatypes to proper one

In [301]:
spring_nona['date'] = spring_nona['date'].dt.month
/opt/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
In [302]:
spring_cleaned = spring_nona[(spring_nona['date'] == 1) |(spring_nona['date'] == 2) | (spring_nona['date'] == 3)]
In [303]:
spring_cleaned['price']= spring_cleaned['price'].astype(str).str.replace("$","").str.replace(",","").astype(float)
/opt/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
In [304]:
spring_cleaned['adjusted_price'] = spring_cleaned['adjusted_price'].astype(str).str.replace("$","").str.replace(",","").astype(float)
/opt/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
In [305]:
spring_cleaned.available = spring_cleaned.available.map(dict(f=1,t=0))
In [306]:
spring_category = ['date','weekday']
In [307]:
spring_cleaned[spring_category] = spring_cleaned[spring_category].astype('category')
In [308]:
spring_cleaned.to_csv(r'/Users/sitebai/Desktop/UCI/fall/bana212da/listing_zips/spring_cleaned.csv', index = False)
In [260]:
spring_cleaned.head()
Out[260]:
listing_id date available price adjusted_price minimum_nights maximum_nights weekday weekdays
0 2732 1 0 140.0 140.0 1.0 180.0 5 Saturday
1 2732 1 0 140.0 140.0 1.0 180.0 6 Sunday
2 2732 1 0 140.0 140.0 1.0 180.0 0 Monday
3 2732 1 0 140.0 140.0 1.0 180.0 1 Tuesday
4 2732 1 0 140.0 140.0 1.0 180.0 2 Wednesday
In [261]:
spring_cleaned.dtypes
Out[261]:
listing_id           int64
date              category
available            int64
price              float64
adjusted_price     float64
minimum_nights     float64
maximum_nights     float64
weekday           category
weekdays            object
dtype: object
In [265]:
spring_booked = spring_cleaned.groupby('date').agg({"available": sum})
In [267]:
spring_booked
Out[267]:
available
date
1 3196969
2 2549200
3 2327712
In [269]:
spring_listed = spring_cleaned.groupby('date')['price'].count().to_frame()
In [270]:
spring_listed
Out[270]:
price
date
1 5385343
2 4907128
3 5341667
In [275]:
spring_booked['booked_rate'] = round(spring_booked['available'] / spring_listed['price'],2)
In [277]:
spring_booked.reset_index(level  = 0, inplace = True)
In [279]:
spring_booked.rename(columns = {"date":'month','available':'num_booked'})
Out[279]:
month num_booked booked_rate
0 1 3196969 0.59
1 2 2549200 0.52
2 3 2327712 0.44
In [382]:
spring_week_listing = spring_cleaned.groupby('weekdays')['price'].count().to_frame()
spring_week_booked = spring_cleaned.groupby('weekdays').agg({'available' : sum})
spring_week_booked['booked_rate'] = round(spring_week_booked['available'] / spring_week_listing['price'],3)
spring_week_booked.reset_index(level = 0, inplace= True)
spring_week_booked.rename(columns = {'available' : 'num_booked'})
Out[382]:
weekdays num_booked booked_rate
0 Friday 1209498 0.529
1 Monday 1068211 0.509
2 Saturday 1228137 0.530
3 Sunday 1169259 0.511
4 Thursday 1190201 0.519
5 Tuesday 1047117 0.502
6 Wednesday 1161458 0.512
In [383]:
spring_week_booked.sort_values('booked_rate', ascending =False)
Out[383]:
weekdays available booked_rate
2 Saturday 1228137 0.530
0 Friday 1209498 0.529
4 Thursday 1190201 0.519
6 Wednesday 1161458 0.512
3 Sunday 1169259 0.511
1 Monday 1068211 0.509
5 Tuesday 1047117 0.502
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 

Winter adding dec

In [401]:
print(os.getcwd())
/Users/sitebai/Desktop/UCI/fall/bana212da/listing_zips
In [400]:
os.chdir("/Users/sitebai/Desktop/UCI/fall/bana212da/listing_zips")
In [402]:
winter = ['oct','nov19','dec']
calwinter = "winter_"
winter_month = {}
for i in winter:
    winter_month[calwinter + i] = pd.read_csv("calendar_" + i +".csv")
    
In [403]:
for i in winter_month:
    print(i)
winter_oct
winter_nov19
winter_dec
In [407]:
for i in winter_month:
    winter_month[i]['date'] = pd.to_datetime(winter_month[i]['date'])
    winter_month[i]['weekday'] = winter_month[i]['date'].apply(lambda x : x.weekday())
    winter_month[i]['weekdays'] = winter_month[i]['weekday'].apply(lambda x : weekDays[x])
In [408]:
winter_df = winter_month['winter_oct']
In [409]:
winter_keys = list(winter_month.keys())
for i in winter_month:
    winter_df = pd.concat([winter_df,winter_month[i]], axis = 0)
In [410]:
winter_df.shape
Out[410]:
(62068744, 9)
In [411]:
winter_df['date'] = winter_df['date'].dt.month
In [412]:
winter_df['date'].value_counts()
Out[412]:
8     5271395
7     5271395
5     5271395
3     5271395
1     5271395
12    5233436
10    5182587
9     5101350
6     5101350
4     5101350
11    5060391
2     4931305
Name: date, dtype: int64
In [413]:
winter_df = winter_df[(winter_df['date'] == 10) | (winter_df['date'] == 11) |(winter_df['date'] == 12)]
In [414]:
winter_df.shape
Out[414]:
(15476414, 9)
In [415]:
winter_df.isnull().sum()
Out[415]:
listing_id           0
date                 0
available            0
price              428
adjusted_price     428
minimum_nights    1003
maximum_nights    1003
weekday              0
weekdays             0
dtype: int64

Removing missing values from winter_df

In [416]:
winter_cleaned = winter_df.dropna()
In [417]:
winter_cleaned['price'] = winter_cleaned['price'].astype(str).str.replace("$","").str.replace(",","").astype(float)
/opt/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
In [418]:
winter_cleaned['adjusted_price'] = winter_cleaned['adjusted_price'].astype(str).str.replace("$","").str.replace(",","").astype(float)
/opt/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
In [298]:
winter_cleaned.dtypes
Out[298]:
listing_id          int64
date                int64
available          object
price              object
adjusted_price     object
minimum_nights    float64
maximum_nights    float64
weekday             int64
weekdays           object
dtype: object

f refers to the listing is booked, and t refers to the listing is available.

In [419]:
winter_cleaned['available'] = winter_cleaned['available'].map(dict(f=1,t=0))
/opt/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
In [420]:
winter_category = ['date','weekday']
winter_cleaned[winter_category] = winter_cleaned[winter_category].astype('category')
/opt/anaconda3/lib/python3.7/site-packages/pandas/core/frame.py:3069: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]
In [421]:
winter_cleaned.shape
Out[421]:
(15474983, 9)
In [422]:
winter_cleaned.to_csv(r'/Users/sitebai/Desktop/UCI/fall/bana212da/listing_zips/winter_cleaned_updated.csv', index = False)
In [357]:
winter_booked = winter_cleaned.groupby('date').agg({"available":sum})
In [358]:
winter_listings = winter_cleaned.groupby('date')['price'].count().to_frame()
In [314]:
winter_booked['booked_rate'] = round(winter_booked['available'] / winter_listings['price'],2)
In [316]:
winter_booked.reset_index(level=0, inplace = True)
In [317]:
winter_booked
Out[317]:
date available booked_rate
0 10 4135494 0.63
1 11 3669306 0.58
2 12 3133887 0.47
In [320]:
winter_booked = winter_booked.rename(columns = {'date':'month','available':'num_booked'})
In [359]:
winter_week_booked = winter_cleaned.groupby('weekdays').agg({'available' : sum})
In [360]:
winter_week_listing = winter_cleaned.groupby('weekdays')['price'].count().to_frame()
In [361]:
winter_week_booked['booked_rate'] = round( winter_week_booked['available'] / winter_week_listing['price'],3)
In [363]:
winter_week_booked.reset_index(level = 0, inplace=True)
In [365]:
winter_week_booked = winter_week_booked.rename(columns = {'available':'booked'})
In [366]:
winter_week_booked.sort_values('booked_rate', ascending  = False)
Out[366]:
weekdays booked booked_rate
2 Saturday 1638634 0.579
0 Friday 1639193 0.578
4 Thursday 1570705 0.563
3 Sunday 1554551 0.557
6 Wednesday 1445910 0.553
5 Tuesday 1569612 0.547
1 Monday 1520082 0.543
In [334]:
winter_df.shape
Out[334]:
(19534409, 9)
In [446]:
summer_cleaned.shape
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-446-05fbbf009289> in <module>
----> 1 summer_cleaned.shape

NameError: name 'summer_cleaned' is not defined
In [337]:
spring_cleaned.shape
Out[337]:
(15634138, 9)
In [321]:
winter_booked
Out[321]:
month num_booked booked_rate
0 10 4135494 0.63
1 11 3669306 0.58
2 12 3133887 0.47
In [445]:
summer_booked
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-445-d91becf9ee73> in <module>
----> 1 summer_booked

NameError: name 'summer_booked' is not defined
In [332]:
spring_booked
Out[332]:
date available booked_rate
0 1 3196969 0.59
1 2 2549200 0.52
2 3 2327712 0.44
In [ ]:
#study summer price treand and booked rate first. 
In [447]:
calsummer ='summer_'
months = {}
for i in summer:
    months[calsummer + i] = pd.read_csv("calendar_" + i + ".csv")
In [405]:
weekDays = ("Monday","Tuesday","Wednesday","Thursday","Friday","Saturday","Sunday")
In [182]:
weekDays = ("Monday","Tuesday","Wednesday","Thursday","Friday","Saturday","Sunday")
for i in calendars:
    calendars[i]['date'] = pd.to_datetime(calendars[i]['date'])
    calendars[i]['weekday'] = calendars[i]['date'].apply(lambda x : x.weekday())
    calendars[i]['weekdays'] = calendars[i]['weekday'].apply(lambda x : weekDays[x])
    
In [448]:
weekDays = ("Monday","Tuesday","Wednesday","Thursday","Friday","Saturday","Sunday")
for i in months:
    months[i]['date'] = pd.to_datetime(months[i]['date'])
    months[i]['weekday'] = months[i]['date'].apply(lambda x : x.weekday())
    months[i]['weekdays'] = months[i]['weekday'].apply(lambda x : weekDays[x])
In [ ]:
def weekday(months):
    
    weekDays = ("Monday","Tuesday","Wednesday","Thursday","Friday","Saturday","Sunday")
    for i in months:
        months[i]['date'] = pd.to_datetime(months[i]['date'])
        months[i]['weekday'] = months[i]['date'].apply(lambda x : x.weekday())
        months[i]['weekdays'] = months[i]['weekday'].apply(lambda x : weekDays[x])
In [278]:
#calendars['cal_jan']['date'].dtype

Concatenate 3 summer calendar tables to one table named summer_df

In [449]:
months.keys()
Out[449]:
dict_keys(['summer_july', 'summer_aug', 'summer_sep'])
In [450]:
summer_list = list(months.keys())
summer_df = months['summer_july']
for i in summer_list:
    summer_df = pd.concat([summer_df, months[i]], axis = 0 )
In [132]:
summer_df.shape
Out[132]:
(65442326, 9)
In [ ]:
summer_df['date']
In [451]:
summer_df['date'] = summer_df['date'].dt.month
In [452]:
summer_df['date'].value_counts()
Out[452]:
12    5558052
10    5558052
5     5558052
3     5558052
1     5558052
8     5513464
7     5469156
11    5378760
6     5378760
4     5378760
9     5333698
2     5199468
Name: date, dtype: int64

Filter out data with only July, August, and September

In [453]:
summer_cleaned = summer_df[(summer_df['date'] == 7) | (summer_df['date'] == 8) | (summer_df['date'] == 9) ]
In [136]:
print("After filter out data of July, August, and September, the new dataset summer_cleaned has %s of rows." % summer_cleaned.shape[0])
After filter out data of July, August, and September, the new dataset summer_cleaned has 16316318 of rows.

check missing values in summer_df table

In [454]:
summer_cleaned.isnull().sum()
Out[454]:
listing_id          0
date                0
available           0
price             603
adjusted_price    603
minimum_nights     82
maximum_nights     82
weekday             0
weekdays            0
dtype: int64
In [455]:
summer_nona = summer_cleaned.dropna()
In [140]:
print("After dropping all rows with na values, the shape of the new summer_df is: %s" % summer_nona.shape[0])
print("The original shape is: %s" % summer_df.shape[0])
After dropping all rows with na values, the shape of the new summer_df is: 16315633
The original shape is: 65442326
In [95]:
#duplicated_listing = summer_cleaned[summer_cleaned.duplicated()]
In [97]:
#duplicated_listing.head(50)

Convert certain columns data typers to proper one.

In [141]:
summer_cleaned.dtypes
Out[141]:
listing_id          int64
date                int64
available          object
price              object
adjusted_price     object
minimum_nights    float64
maximum_nights    float64
weekday             int64
weekdays           object
dtype: object
In [272]:
summer_cleaned[summer_cleaned['price'] != summer_cleaned['adjusted_price']].head()
Out[272]:
listing_id date available price adjusted_price minimum_nights maximum_nights weekday weekdays
7384 32963 7 f $180.00 $162.00 1.0 180.0 1 Tuesday
7385 32963 7 f $180.00 $162.00 1.0 180.0 2 Wednesday
8076 7992 7 f $69.00 $68.00 1.0 1250.0 1 Tuesday
8077 7992 7 f $95.00 $93.00 1.0 1250.0 2 Wednesday
8078 7992 7 f $95.00 $93.00 1.0 1250.0 3 Thursday

Convert object data type columb price and adjusted price to float data type

In [456]:
summer_cleaned['price'] = summer_cleaned['price'].astype(str).str.replace("$","").str.replace(",","").astype(float)
summer_cleaned['adjusted_price'] = summer_cleaned['price'].astype(str).str.replace("$","").str.replace(",","").astype(float)
/opt/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
/opt/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
In [457]:
print(os.getcwd())
/Users/sitebai/Desktop/UCI/fall/bana212da/listing_zips
In [458]:
summer_cleaned.to_csv('/Users/sitebai/Desktop/UCI/fall/bana212da/listing_zips/summer_2019_cleaned.csv')

Convert inte64 data type of date to category data type.

In [143]:
summer_cleaned.head()
Out[143]:
listing_id date available price adjusted_price minimum_nights maximum_nights weekday weekdays
0 26082 7 f 55.0 55.0 3.0 28.0 0 Monday
1 109 7 f 115.0 115.0 30.0 730.0 1 Tuesday
2 109 7 f 115.0 115.0 30.0 730.0 2 Wednesday
3 109 7 f 115.0 115.0 30.0 730.0 3 Thursday
4 109 7 f 115.0 115.0 30.0 730.0 4 Friday
In [274]:
summer_cleaned.weekdays = summer_cleaned.weekdays.astype('category')
/opt/anaconda3/lib/python3.7/site-packages/pandas/core/generic.py:5208: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value
In [275]:
summer_cleaned.weekday = summer_cleaned.weekday.astype('category')
In [276]:
summer_cleaned.date = summer_cleaned.date.astype('category')
In [277]:
summer_cleaned.dtypes
Out[277]:
listing_id           int64
date              category
available           object
price              float64
adjusted_price     float64
minimum_nights     float64
maximum_nights     float64
weekday           category
weekdays          category
dtype: object
In [148]:
summer_cleaned.groupby('date')['price'].mean().sort_values(ascending = False)
Out[148]:
date
7    224.112071
8    223.146741
9    216.203708
Name: price, dtype: float64

Convert available to 1 and 0. 1 refers to booked, 0 means it is available to book.

In [459]:
summer_cleaned.available = summer_cleaned.available.map(dict(f=1,t=0))
/opt/anaconda3/lib/python3.7/site-packages/pandas/core/generic.py:5168: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value
In [460]:
summer_cleaned.to_csv('/Users/sitebai/Desktop/UCI/fall/bana212da/listing_zips/summer_2019_cleaned_final.csv')
In [151]:
summer_cleaned.head()
Out[151]:
listing_id date available price adjusted_price minimum_nights maximum_nights weekday weekdays
0 26082 7 1 55.0 55.0 3.0 28.0 0 Monday
1 109 7 1 115.0 115.0 30.0 730.0 1 Tuesday
2 109 7 1 115.0 115.0 30.0 730.0 2 Wednesday
3 109 7 1 115.0 115.0 30.0 730.0 3 Thursday
4 109 7 1 115.0 115.0 30.0 730.0 4 Friday
In [279]:
summer_cleaned.to_csv(r'/Users/sitebai/Desktop/UCI/fall/bana212da/listing_zips/summer_cleaned.csv', index = False)
In [461]:
listings_grouped = summer_cleaned.groupby('date')['price'].count().to_frame()
In [462]:
available_grouped['available'] /listings_grouped['price']
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-462-5015b83315bc> in <module>
----> 1 available_grouped['available'] /listings_grouped['price']

NameError: name 'available_grouped' is not defined
In [468]:
listings_grouped
Out[468]:
price
date
7 5469071
8 5513237
9 5333407
In [466]:
summer_booked = summer_cleaned.groupby('date').agg({'available' : sum})
In [471]:
summer_avg_price = summer_cleaned.groupby('date').agg({'price' : mean})
In [472]:
summer_avg_price
Out[472]:
price
date
7 224.112071
8 223.146741
9 216.203708
In [469]:
summer_booked['book_rate'] = round(summer_booked['available'] /listings_grouped['price'],2)
In [473]:
summer_booked['avg_price'] = round(summer_avg_price.price,2)
In [475]:
summer_booked.reset_index(inplace = True)
In [476]:
summer_booked
Out[476]:
date available book_rate avg_price
0 7 3612729 0.66 224.11
1 8 3274042 0.59 223.15
2 9 2555994 0.48 216.20
In [477]:
print(os.getcwd())
/Users/sitebai/Desktop/UCI/fall/bana212da/listing_zips
In [478]:
summer_booked.to_csv('/Users/sitebai/Desktop/UCI/fall/bana212da/listing_zips/summer_2019_avg_price.csv')
In [327]:
summer_booked.reset_index(level = 0,inplace = True)
In [329]:
summer_booked = booked_grouped.rename(columns = {'date':'month','available':'num_booked'})
In [482]:
summer_booked['booked_rate_2020'] = current_booked['booked_rate']
In [483]:
summer_booked
Out[483]:
date available book_rate avg_price booked_rate_2020
0 7 3612729 0.66 224.11 0.610
1 8 3274042 0.59 223.15 0.532
2 9 2555994 0.48 216.20 0.465
In [485]:
summer_booked.to_csv("/Users/sitebai/Desktop/UCI/fall/bana212da/listing_zips/summer_2019_2020.csv")
In [484]:
print(os.getcwd())
/Users/sitebai/Desktop/UCI/fall/bana212da/listing_zips
In [486]:
summer_week_booked = summer_cleaned.groupby('weekdays').agg({'available' : sum})
In [487]:
summer_week_listed = summer_cleaned.groupby('weekdays')['price'].count().to_frame()
In [488]:
summer_week_booked['booked_rate'] = round(summer_week_booked['available'] / summer_week_listed['price'],3)
In [489]:
summer_week_booked.reset_index(level = 0, inplace = True)
In [490]:
summer_week_booked = summer_week_booked.rename(columns = {'available':'num_booked'})
In [491]:
summer_week_booked
Out[491]:
weekdays num_booked booked_rate
0 Friday 1386340 0.594
1 Monday 1316019 0.559
2 Saturday 1417987 0.597
3 Sunday 1331527 0.571
4 Thursday 1389901 0.585
5 Tuesday 1264599 0.570
6 Wednesday 1336392 0.573
In [376]:
summer_week_booked.sort_values('booked_rate',ascending  = False)
Out[376]:
weekdays num_booked booked_rate
2 Saturday 1417987 0.597
0 Friday 1386340 0.594
4 Thursday 1389901 0.585
6 Wednesday 1336392 0.573
3 Sunday 1331527 0.571
5 Tuesday 1264599 0.570
1 Monday 1316019 0.559

Concatenate 3 winter calendar tables to one table named winter_df

In [ ]:
winter  = ['oct','nov1','nov15','nov19','dec']
In [ ]:
 

Listings for summer

In [250]:
 
july
aug
sep

Concatenate 12 calendar tables to one final table named calendar_df

In [183]:
calendars.keys()
Out[183]:
dict_keys(['cal_aug', 'cal_sep', 'cal_oct', 'cal_nov1', 'cal_nov15', 'cal_nov19', 'cal_dec', 'cal_jan'])
In [185]:
cal_list = list(calendars.keys())
cal_list
Out[185]:
['cal_aug',
 'cal_sep',
 'cal_oct',
 'cal_nov1',
 'cal_nov15',
 'cal_nov19',
 'cal_dec',
 'cal_jan']
In [189]:
calendar_df = calendars['cal_jan']
for i in cal_list[1:]:
    calendar_df = pd.concat([calendar_df, calendars[i]], axis = 0)
    
    
In [282]:
#calendar_df['listing_id'].value_counts().shape
In [190]:
calendar_df.shape
Out[190]:
(124919095, 9)

Checking missing values of calendar dataframe

In [191]:
calendar_df.isnull().sum()
Out[191]:
listing_id           0
date                 0
available            0
price             3060
adjusted_price    3060
minimum_nights    1580
maximum_nights    1580
weekday              0
weekdays             0
dtype: int64
In [ ]:
calendar_df[['weekdays','price']].groupby('weekdays').agg(lambda x: x.mean())
In [284]:
#airbnb_nomissing['id'].value_counts().shape

1. Concatenate 12 listings tabels into one final table named airbnb_df

In [254]:
summer_listings = pd.concat([names['la_july'], names['la_aug'],names['la_sep']], axis=0)
In [398]:
airbnb_training.columns
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-398-1dd27a55850f> in <module>
----> 1 airbnb_training.columns

NameError: name 'airbnb_training' is not defined
In [ ]:
summer_listings.head()
In [257]:
summer_listings = summer_listings[['id', 'host_is_superhost', 'host_total_listings_count', 'zipcode',
       'property_type', 'room_type', 'accommodates', 'bathrooms', 'bedrooms',
       'beds', 'amenities', 'price', 'cleaning_fee', 'guests_included',
       'minimum_nights', 'maximum_nights', 'instant_bookable',
       'calculated_host_listings_count_entire_homes',
       'calculated_host_listings_count_private_rooms',
       'calculated_host_listings_count_shared_rooms']]
In [314]:
summer_merged['room_type'].value_counts()
Out[314]:
Entire home/apt    29040515
Private room       14740874
Shared room         1749296
Hotel room           141778
Name: room_type, dtype: int64
In [315]:
summer_merged.isnull().sum()
Out[315]:
id                                                    0
host_is_superhost                                 13469
host_total_listings_count                         13469
zipcode                                          558424
property_type                                         0
room_type                                             0
accommodates                                          0
bathrooms                                         21203
bedrooms                                          20839
beds                                              66339
amenities                                             0
price_x                                               0
cleaning_fee                                    5790018
guests_included                                       0
minimum_nights_x                                      0
maximum_nights_x                                      0
instant_bookable                                      0
calculated_host_listings_count_entire_homes           0
calculated_host_listings_count_private_rooms          0
calculated_host_listings_count_shared_rooms           0
listing_id                                            0
date                                                  0
available                                             0
price_y                                            1760
adjusted_price                                     1760
minimum_nights_y                                    225
maximum_nights_y                                    225
weekday                                               0
weekdays                                              0
dtype: int64
In [316]:
summer_merged.shape
Out[316]:
(45672463, 29)
In [318]:
summer_merged = summer_merged.dropna()
In [330]:
summer_merged['amenities'] = summer_merged['amenities'].apply(lambda x : len(x.split(",")))
In [ ]:
airbnb_subset = airbnb_partial_cleaned[airbnb_partial_cleaned.property_type
                                       .isin(['Apartment',"House",'Condominium','Guesthouse'])]
In [331]:
summer_merged['property_type'].value_counts()
Out[331]:
Apartment                 14350956
House                     13287680
Guesthouse                 2277313
Condominium                2270826
Guest suite                1465713
Townhouse                  1263415
Bungalow                   1161654
Loft                        864191
Villa                       847231
Serviced apartment          285558
Hostel                      210128
Cottage                     192201
Bed and breakfast           136412
Camper/RV                   131313
Boutique hotel              116025
Other                       105569
Tiny house                   83629
Cabin                        63882
Aparthotel                   55965
Hotel                        37037
Farm stay                    23023
Boat                         17836
Tent                         17108
Chalet                       14290
Castle                       13832
Treehouse                    12012
Earth house                   9282
Campsite                      8918
Dome house                    7280
Yurt                          6643
Barn                          5187
Tipi                          5096
Resort                        2184
Island                        2184
Hut                           2184
Bus                           2184
Casa particular (Cuba)        1638
Train                         1365
Igloo                         1092
Pension (South Korea)         1092
Dorm                           910
Cave                           728
Lighthouse                     546
Vacation home                   91
Name: property_type, dtype: int64
In [350]:
summer_merged_subset = summer_merged[summer_merged.property_type
                                       .isin(['Apartment',"House",'Condominium','Guesthouse'])]
In [348]:
summer_merged_drop_columns = ['price_x', 'guests_included',
       'minimum_nights_x', 'maximum_nights_x', 'listing_id','adjusted_price']
In [341]:
summer_merged_subset = summer_merged_subset.drop(columns = summer_merged_drop_columns)
In [346]:
summer_merged_subset.columns
Out[346]:
Index(['id', 'host_is_superhost', 'host_total_listings_count', 'zipcode',
       'property_type', 'room_type', 'accommodates', 'bathrooms', 'bedrooms',
       'beds', 'amenities', 'cleaning_fee', 'instant_bookable',
       'calculated_host_listings_count_entire_homes',
       'calculated_host_listings_count_private_rooms',
       'calculated_host_listings_count_shared_rooms', 'date', 'available',
       'price_y', 'adjusted_price', 'minimum_nights_y', 'maximum_nights_y',
       'weekday', 'weekdays'],
      dtype='object')
In [347]:
airbnb_training.columns
Out[347]:
Index(['id', 'host_is_superhost', 'host_total_listings_count', 'zipcode',
       'property_type', 'room_type', 'accommodates', 'bathrooms', 'bedrooms',
       'beds', 'amenities', 'price', 'cleaning_fee', 'guests_included',
       'minimum_nights', 'maximum_nights', 'instant_bookable',
       'calculated_host_listings_count_entire_homes',
       'calculated_host_listings_count_private_rooms',
       'calculated_host_listings_count_shared_rooms'],
      dtype='object')
In [309]:
summer_cleaned.head()
Out[309]:
listing_id date available price adjusted_price minimum_nights maximum_nights weekday weekdays
0 26082 7 1 55.0 55.0 3.0 28.0 0 Monday
1 109 7 1 115.0 115.0 30.0 730.0 1 Tuesday
2 109 7 1 115.0 115.0 30.0 730.0 2 Wednesday
3 109 7 1 115.0 115.0 30.0 730.0 3 Thursday
4 109 7 1 115.0 115.0 30.0 730.0 4 Friday
In [311]:
summer_merged = summer_listings.merge(summer_cleaned, left_on = 'id', right_on = 'listing_id')
In [325]:
summer_cleaned[summer_cleaned['listing_id']==109].shape
Out[325]:
(364, 9)
In [328]:
summer_cleaned['date'].value_counts()
Out[328]:
8    5513464
7    5469156
9    5333698
Name: date, dtype: int64
In [324]:
summer_merged.head(100)
Out[324]:
id host_is_superhost host_total_listings_count zipcode property_type room_type accommodates bathrooms bedrooms beds amenities price_x cleaning_fee guests_included minimum_nights_x maximum_nights_x instant_bookable calculated_host_listings_count_entire_homes calculated_host_listings_count_private_rooms calculated_host_listings_count_shared_rooms listing_id date available price_y adjusted_price minimum_nights_y maximum_nights_y weekday weekdays
0 109 f 1.0 90230 Condominium Entire home/apt 6 2.0 2.0 3.0 {TV,"Cable TV",Internet,Wifi,"Air conditioning... $122.00 $240.00 3 30 730 f 1 0 0 109 7 1 115.0 115.0 30.0 730.0 1 Tuesday
1 109 f 1.0 90230 Condominium Entire home/apt 6 2.0 2.0 3.0 {TV,"Cable TV",Internet,Wifi,"Air conditioning... $122.00 $240.00 3 30 730 f 1 0 0 109 7 1 115.0 115.0 30.0 730.0 2 Wednesday
2 109 f 1.0 90230 Condominium Entire home/apt 6 2.0 2.0 3.0 {TV,"Cable TV",Internet,Wifi,"Air conditioning... $122.00 $240.00 3 30 730 f 1 0 0 109 7 1 115.0 115.0 30.0 730.0 3 Thursday
3 109 f 1.0 90230 Condominium Entire home/apt 6 2.0 2.0 3.0 {TV,"Cable TV",Internet,Wifi,"Air conditioning... $122.00 $240.00 3 30 730 f 1 0 0 109 7 1 115.0 115.0 30.0 730.0 4 Friday
4 109 f 1.0 90230 Condominium Entire home/apt 6 2.0 2.0 3.0 {TV,"Cable TV",Internet,Wifi,"Air conditioning... $122.00 $240.00 3 30 730 f 1 0 0 109 7 1 115.0 115.0 30.0 730.0 5 Saturday
5 109 f 1.0 90230 Condominium Entire home/apt 6 2.0 2.0 3.0 {TV,"Cable TV",Internet,Wifi,"Air conditioning... $122.00 $240.00 3 30 730 f 1 0 0 109 7 1 115.0 115.0 30.0 730.0 6 Sunday
6 109 f 1.0 90230 Condominium Entire home/apt 6 2.0 2.0 3.0 {TV,"Cable TV",Internet,Wifi,"Air conditioning... $122.00 $240.00 3 30 730 f 1 0 0 109 7 1 115.0 115.0 30.0 730.0 0 Monday
7 109 f 1.0 90230 Condominium Entire home/apt 6 2.0 2.0 3.0 {TV,"Cable TV",Internet,Wifi,"Air conditioning... $122.00 $240.00 3 30 730 f 1 0 0 109 7 1 115.0 115.0 30.0 730.0 1 Tuesday
8 109 f 1.0 90230 Condominium Entire home/apt 6 2.0 2.0 3.0 {TV,"Cable TV",Internet,Wifi,"Air conditioning... $122.00 $240.00 3 30 730 f 1 0 0 109 7 1 115.0 115.0 30.0 730.0 2 Wednesday
9 109 f 1.0 90230 Condominium Entire home/apt 6 2.0 2.0 3.0 {TV,"Cable TV",Internet,Wifi,"Air conditioning... $122.00 $240.00 3 30 730 f 1 0 0 109 7 1 115.0 115.0 30.0 730.0 3 Thursday
10 109 f 1.0 90230 Condominium Entire home/apt 6 2.0 2.0 3.0 {TV,"Cable TV",Internet,Wifi,"Air conditioning... $122.00 $240.00 3 30 730 f 1 0 0 109 7 1 115.0 115.0 30.0 730.0 4 Friday
11 109 f 1.0 90230 Condominium Entire home/apt 6 2.0 2.0 3.0 {TV,"Cable TV",Internet,Wifi,"Air conditioning... $122.00 $240.00 3 30 730 f 1 0 0 109 7 1 115.0 115.0 30.0 730.0 5 Saturday
12 109 f 1.0 90230 Condominium Entire home/apt 6 2.0 2.0 3.0 {TV,"Cable TV",Internet,Wifi,"Air conditioning... $122.00 $240.00 3 30 730 f 1 0 0 109 7 1 115.0 115.0 30.0 730.0 6 Sunday
13 109 f 1.0 90230 Condominium Entire home/apt 6 2.0 2.0 3.0 {TV,"Cable TV",Internet,Wifi,"Air conditioning... $122.00 $240.00 3 30 730 f 1 0 0 109 7 1 115.0 115.0 30.0 730.0 0 Monday
14 109 f 1.0 90230 Condominium Entire home/apt 6 2.0 2.0 3.0 {TV,"Cable TV",Internet,Wifi,"Air conditioning... $122.00 $240.00 3 30 730 f 1 0 0 109 7 1 115.0 115.0 30.0 730.0 1 Tuesday
15 109 f 1.0 90230 Condominium Entire home/apt 6 2.0 2.0 3.0 {TV,"Cable TV",Internet,Wifi,"Air conditioning... $122.00 $240.00 3 30 730 f 1 0 0 109 7 1 115.0 115.0 30.0 730.0 2 Wednesday
16 109 f 1.0 90230 Condominium Entire home/apt 6 2.0 2.0 3.0 {TV,"Cable TV",Internet,Wifi,"Air conditioning... $122.00 $240.00 3 30 730 f 1 0 0 109 7 1 115.0 115.0 30.0 730.0 3 Thursday
17 109 f 1.0 90230 Condominium Entire home/apt 6 2.0 2.0 3.0 {TV,"Cable TV",Internet,Wifi,"Air conditioning... $122.00 $240.00 3 30 730 f 1 0 0 109 7 1 115.0 115.0 30.0 730.0 4 Friday
18 109 f 1.0 90230 Condominium Entire home/apt 6 2.0 2.0 3.0 {TV,"Cable TV",Internet,Wifi,"Air conditioning... $122.00 $240.00 3 30 730 f 1 0 0 109 7 1 115.0 115.0 30.0 730.0 5 Saturday
19 109 f 1.0 90230 Condominium Entire home/apt 6 2.0 2.0 3.0 {TV,"Cable TV",Internet,Wifi,"Air conditioning... $122.00 $240.00 3 30 730 f 1 0 0 109 7 1 115.0 115.0 30.0 730.0 6 Sunday
20 109 f 1.0 90230 Condominium Entire home/apt 6 2.0 2.0 3.0 {TV,"Cable TV",Internet,Wifi,"Air conditioning... $122.00 $240.00 3 30 730 f 1 0 0 109 7 1 115.0 115.0 30.0 730.0 0 Monday
21 109 f 1.0 90230 Condominium Entire home/apt 6 2.0 2.0 3.0 {TV,"Cable TV",Internet,Wifi,"Air conditioning... $122.00 $240.00 3 30 730 f 1 0 0 109 7 1 115.0 115.0 30.0 730.0 1 Tuesday
22 109 f 1.0 90230 Condominium Entire home/apt 6 2.0 2.0 3.0 {TV,"Cable TV",Internet,Wifi,"Air conditioning... $122.00 $240.00 3 30 730 f 1 0 0 109 7 1 115.0 115.0 30.0 730.0 2 Wednesday
23 109 f 1.0 90230 Condominium Entire home/apt 6 2.0 2.0 3.0 {TV,"Cable TV",Internet,Wifi,"Air conditioning... $122.00 $240.00 3 30 730 f 1 0 0 109 8 1 115.0 115.0 30.0 730.0 3 Thursday
24 109 f 1.0 90230 Condominium Entire home/apt 6 2.0 2.0 3.0 {TV,"Cable TV",Internet,Wifi,"Air conditioning... $122.00 $240.00 3 30 730 f 1 0 0 109 8 1 115.0 115.0 30.0 730.0 4 Friday
25 109 f 1.0 90230 Condominium Entire home/apt 6 2.0 2.0 3.0 {TV,"Cable TV",Internet,Wifi,"Air conditioning... $122.00 $240.00 3 30 730 f 1 0 0 109 8 1 115.0 115.0 30.0 730.0 5 Saturday
26 109 f 1.0 90230 Condominium Entire home/apt 6 2.0 2.0 3.0 {TV,"Cable TV",Internet,Wifi,"Air conditioning... $122.00 $240.00 3 30 730 f 1 0 0 109 8 1 115.0 115.0 30.0 730.0 6 Sunday
27 109 f 1.0 90230 Condominium Entire home/apt 6 2.0 2.0 3.0 {TV,"Cable TV",Internet,Wifi,"Air conditioning... $122.00 $240.00 3 30 730 f 1 0 0 109 8 1 115.0 115.0 30.0 730.0 0 Monday
28 109 f 1.0 90230 Condominium Entire home/apt 6 2.0 2.0 3.0 {TV,"Cable TV",Internet,Wifi,"Air conditioning... $122.00 $240.00 3 30 730 f 1 0 0 109 8 1 115.0 115.0 30.0 730.0 1 Tuesday
29 109 f 1.0 90230 Condominium Entire home/apt 6 2.0 2.0 3.0 {TV,"Cable TV",Internet,Wifi,"Air conditioning... $122.00 $240.00 3 30 730 f 1 0 0 109 8 1 115.0 115.0 30.0 730.0 2 Wednesday
30 109 f 1.0 90230 Condominium Entire home/apt 6 2.0 2.0 3.0 {TV,"Cable TV",Internet,Wifi,"Air conditioning... $122.00 $240.00 3 30 730 f 1 0 0 109 8 1 115.0 115.0 30.0 730.0 3 Thursday
31 109 f 1.0 90230 Condominium Entire home/apt 6 2.0 2.0 3.0 {TV,"Cable TV",Internet,Wifi,"Air conditioning... $122.00 $240.00 3 30 730 f 1 0 0 109 8 1 115.0 115.0 30.0 730.0 4 Friday
32 109 f 1.0 90230 Condominium Entire home/apt 6 2.0 2.0 3.0 {TV,"Cable TV",Internet,Wifi,"Air conditioning... $122.00 $240.00 3 30 730 f 1 0 0 109 8 1 115.0 115.0 30.0 730.0 5 Saturday
33 109 f 1.0 90230 Condominium Entire home/apt 6 2.0 2.0 3.0 {TV,"Cable TV",Internet,Wifi,"Air conditioning... $122.00 $240.00 3 30 730 f 1 0 0 109 8 1 115.0 115.0 30.0 730.0 6 Sunday
34 109 f 1.0 90230 Condominium Entire home/apt 6 2.0 2.0 3.0 {TV,"Cable TV",Internet,Wifi,"Air conditioning... $122.00 $240.00 3 30 730 f 1 0 0 109 8 1 115.0 115.0 30.0 730.0 0 Monday
35 109 f 1.0 90230 Condominium Entire home/apt 6 2.0 2.0 3.0 {TV,"Cable TV",Internet,Wifi,"Air conditioning... $122.00 $240.00 3 30 730 f 1 0 0 109 8 1 115.0 115.0 30.0 730.0 1 Tuesday
36 109 f 1.0 90230 Condominium Entire home/apt 6 2.0 2.0 3.0 {TV,"Cable TV",Internet,Wifi,"Air conditioning... $122.00 $240.00 3 30 730 f 1 0 0 109 8 1 115.0 115.0 30.0 730.0 2 Wednesday
37 109 f 1.0 90230 Condominium Entire home/apt 6 2.0 2.0 3.0 {TV,"Cable TV",Internet,Wifi,"Air conditioning... $122.00 $240.00 3 30 730 f 1 0 0 109 8 1 115.0 115.0 30.0 730.0 3 Thursday
38 109 f 1.0 90230 Condominium Entire home/apt 6 2.0 2.0 3.0 {TV,"Cable TV",Internet,Wifi,"Air conditioning... $122.00 $240.00 3 30 730 f 1 0 0 109 8 1 115.0 115.0 30.0 730.0 4 Friday
39 109 f 1.0 90230 Condominium Entire home/apt 6 2.0 2.0 3.0 {TV,"Cable TV",Internet,Wifi,"Air conditioning... $122.00 $240.00 3 30 730 f 1 0 0 109 8 1 115.0 115.0 30.0 730.0 5 Saturday
40 109 f 1.0 90230 Condominium Entire home/apt 6 2.0 2.0 3.0 {TV,"Cable TV",Internet,Wifi,"Air conditioning... $122.00 $240.00 3 30 730 f 1 0 0 109 8 1 115.0 115.0 30.0 730.0 6 Sunday
41 109 f 1.0 90230 Condominium Entire home/apt 6 2.0 2.0 3.0 {TV,"Cable TV",Internet,Wifi,"Air conditioning... $122.00 $240.00 3 30 730 f 1 0 0 109 8 1 115.0 115.0 30.0 730.0 0 Monday
42 109 f 1.0 90230 Condominium Entire home/apt 6 2.0 2.0 3.0 {TV,"Cable TV",Internet,Wifi,"Air conditioning... $122.00 $240.00 3 30 730 f 1 0 0 109 8 1 115.0 115.0 30.0 730.0 1 Tuesday
43 109 f 1.0 90230 Condominium Entire home/apt 6 2.0 2.0 3.0 {TV,"Cable TV",Internet,Wifi,"Air conditioning... $122.00 $240.00 3 30 730 f 1 0 0 109 8 1 115.0 115.0 30.0 730.0 2 Wednesday
44 109 f 1.0 90230 Condominium Entire home/apt 6 2.0 2.0 3.0 {TV,"Cable TV",Internet,Wifi,"Air conditioning... $122.00 $240.00 3 30 730 f 1 0 0 109 8 1 115.0 115.0 30.0 730.0 3 Thursday
45 109 f 1.0 90230 Condominium Entire home/apt 6 2.0 2.0 3.0 {TV,"Cable TV",Internet,Wifi,"Air conditioning... $122.00 $240.00 3 30 730 f 1 0 0 109 8 1 115.0 115.0 30.0 730.0 4 Friday
46 109 f 1.0 90230 Condominium Entire home/apt 6 2.0 2.0 3.0 {TV,"Cable TV",Internet,Wifi,"Air conditioning... $122.00 $240.00 3 30 730 f 1 0 0 109 8 1 115.0 115.0 30.0 730.0 5 Saturday
47 109 f 1.0 90230 Condominium Entire home/apt 6 2.0 2.0 3.0 {TV,"Cable TV",Internet,Wifi,"Air conditioning... $122.00 $240.00 3 30 730 f 1 0 0 109 8 1 115.0 115.0 30.0 730.0 6 Sunday
48 109 f 1.0 90230 Condominium Entire home/apt 6 2.0 2.0 3.0 {TV,"Cable TV",Internet,Wifi,"Air conditioning... $122.00 $240.00 3 30 730 f 1 0 0 109 8 1 115.0 115.0 30.0 730.0 0 Monday
49 109 f 1.0 90230 Condominium Entire home/apt 6 2.0 2.0 3.0 {TV,"Cable TV",Internet,Wifi,"Air conditioning... $122.00 $240.00 3 30 730 f 1 0 0 109 8 1 115.0 115.0 30.0 730.0 1 Tuesday
50 109 f 1.0 90230 Condominium Entire home/apt 6 2.0 2.0 3.0 {TV,"Cable TV",Internet,Wifi,"Air conditioning... $122.00 $240.00 3 30 730 f 1 0 0 109 8 1 115.0 115.0 30.0 730.0 2 Wednesday
51 109 f 1.0 90230 Condominium Entire home/apt 6 2.0 2.0 3.0 {TV,"Cable TV",Internet,Wifi,"Air conditioning... $122.00 $240.00 3 30 730 f 1 0 0 109 8 1 115.0 115.0 30.0 730.0 3 Thursday
52 109 f 1.0 90230 Condominium Entire home/apt 6 2.0 2.0 3.0 {TV,"Cable TV",Internet,Wifi,"Air conditioning... $122.00 $240.00 3 30 730 f 1 0 0 109 8 1 115.0 115.0 30.0 730.0 4 Friday
53 109 f 1.0 90230 Condominium Entire home/apt 6 2.0 2.0 3.0 {TV,"Cable TV",Internet,Wifi,"Air conditioning... $122.00 $240.00 3 30 730 f 1 0 0 109 8 1 115.0 115.0 30.0 730.0 5 Saturday
54 109 f 1.0 90230 Condominium Entire home/apt 6 2.0 2.0 3.0 {TV,"Cable TV",Internet,Wifi,"Air conditioning... $122.00 $240.00 3 30 730 f 1 0 0 109 9 1 115.0 115.0 30.0 730.0 6 Sunday
55 109 f 1.0 90230 Condominium Entire home/apt 6 2.0 2.0 3.0 {TV,"Cable TV",Internet,Wifi,"Air conditioning... $122.00 $240.00 3 30 730 f 1 0 0 109 9 1 115.0 115.0 30.0 730.0 0 Monday
56 109 f 1.0 90230 Condominium Entire home/apt 6 2.0 2.0 3.0 {TV,"Cable TV",Internet,Wifi,"Air conditioning... $122.00 $240.00 3 30 730 f 1 0 0 109 9 1 115.0 115.0 30.0 730.0 1 Tuesday
57 109 f 1.0 90230 Condominium Entire home/apt 6 2.0 2.0 3.0 {TV,"Cable TV",Internet,Wifi,"Air conditioning... $122.00 $240.00 3 30 730 f 1 0 0 109 9 1 115.0 115.0 30.0 730.0 2 Wednesday
58 109 f 1.0 90230 Condominium Entire home/apt 6 2.0 2.0 3.0 {TV,"Cable TV",Internet,Wifi,"Air conditioning... $122.00 $240.00 3 30 730 f 1 0 0 109 9 1 115.0 115.0 30.0 730.0 3 Thursday
59 109 f 1.0 90230 Condominium Entire home/apt 6 2.0 2.0 3.0 {TV,"Cable TV",Internet,Wifi,"Air conditioning... $122.00 $240.00 3 30 730 f 1 0 0 109 9 1 115.0 115.0 30.0 730.0 4 Friday
60 109 f 1.0 90230 Condominium Entire home/apt 6 2.0 2.0 3.0 {TV,"Cable TV",Internet,Wifi,"Air conditioning... $122.00 $240.00 3 30 730 f 1 0 0 109 9 1 115.0 115.0 30.0 730.0 5 Saturday
61 109 f 1.0 90230 Condominium Entire home/apt 6 2.0 2.0 3.0 {TV,"Cable TV",Internet,Wifi,"Air conditioning... $122.00 $240.00 3 30 730 f 1 0 0 109 9 1 115.0 115.0 30.0 730.0 6 Sunday
62 109 f 1.0 90230 Condominium Entire home/apt 6 2.0 2.0 3.0 {TV,"Cable TV",Internet,Wifi,"Air conditioning... $122.00 $240.00 3 30 730 f 1 0 0 109 9 1 115.0 115.0 30.0 730.0 0 Monday
63 109 f 1.0 90230 Condominium Entire home/apt 6 2.0 2.0 3.0 {TV,"Cable TV",Internet,Wifi,"Air conditioning... $122.00 $240.00 3 30 730 f 1 0 0 109 9 1 115.0 115.0 30.0 730.0 1 Tuesday
64 109 f 1.0 90230 Condominium Entire home/apt 6 2.0 2.0 3.0 {TV,"Cable TV",Internet,Wifi,"Air conditioning... $122.00 $240.00 3 30 730 f 1 0 0 109 9 1 115.0 115.0 30.0 730.0 2 Wednesday
65 109 f 1.0 90230 Condominium Entire home/apt 6 2.0 2.0 3.0 {TV,"Cable TV",Internet,Wifi,"Air conditioning... $122.00 $240.00 3 30 730 f 1 0 0 109 9 1 115.0 115.0 30.0 730.0 3 Thursday
66 109 f 1.0 90230 Condominium Entire home/apt 6 2.0 2.0 3.0 {TV,"Cable TV",Internet,Wifi,"Air conditioning... $122.00 $240.00 3 30 730 f 1 0 0 109 9 1 115.0 115.0 30.0 730.0 4 Friday
67 109 f 1.0 90230 Condominium Entire home/apt 6 2.0 2.0 3.0 {TV,"Cable TV",Internet,Wifi,"Air conditioning... $122.00 $240.00 3 30 730 f 1 0 0 109 9 1 115.0 115.0 30.0 730.0 5 Saturday
68 109 f 1.0 90230 Condominium Entire home/apt 6 2.0 2.0 3.0 {TV,"Cable TV",Internet,Wifi,"Air conditioning... $122.00 $240.00 3 30 730 f 1 0 0 109 9 1 115.0 115.0 30.0 730.0 6 Sunday
69 109 f 1.0 90230 Condominium Entire home/apt 6 2.0 2.0 3.0 {TV,"Cable TV",Internet,Wifi,"Air conditioning... $122.00 $240.00 3 30 730 f 1 0 0 109 9 1 115.0 115.0 30.0 730.0 0 Monday
70 109 f 1.0 90230 Condominium Entire home/apt 6 2.0 2.0 3.0 {TV,"Cable TV",Internet,Wifi,"Air conditioning... $122.00 $240.00 3 30 730 f 1 0 0 109 9 1 115.0 115.0 30.0 730.0 1 Tuesday
71 109 f 1.0 90230 Condominium Entire home/apt 6 2.0 2.0 3.0 {TV,"Cable TV",Internet,Wifi,"Air conditioning... $122.00 $240.00 3 30 730 f 1 0 0 109 9 1 115.0 115.0 30.0 730.0 2 Wednesday
72 109 f 1.0 90230 Condominium Entire home/apt 6 2.0 2.0 3.0 {TV,"Cable TV",Internet,Wifi,"Air conditioning... $122.00 $240.00 3 30 730 f 1 0 0 109 9 1 115.0 115.0 30.0 730.0 3 Thursday
73 109 f 1.0 90230 Condominium Entire home/apt 6 2.0 2.0 3.0 {TV,"Cable TV",Internet,Wifi,"Air conditioning... $122.00 $240.00 3 30 730 f 1 0 0 109 9 1 115.0 115.0 30.0 730.0 4 Friday
74 109 f 1.0 90230 Condominium Entire home/apt 6 2.0 2.0 3.0 {TV,"Cable TV",Internet,Wifi,"Air conditioning... $122.00 $240.00 3 30 730 f 1 0 0 109 9 1 115.0 115.0 30.0 730.0 5 Saturday
75 109 f 1.0 90230 Condominium Entire home/apt 6 2.0 2.0 3.0 {TV,"Cable TV",Internet,Wifi,"Air conditioning... $122.00 $240.00 3 30 730 f 1 0 0 109 9 1 115.0 115.0 30.0 730.0 6 Sunday
76 109 f 1.0 90230 Condominium Entire home/apt 6 2.0 2.0 3.0 {TV,"Cable TV",Internet,Wifi,"Air conditioning... $122.00 $240.00 3 30 730 f 1 0 0 109 9 1 115.0 115.0 30.0 730.0 0 Monday
77 109 f 1.0 90230 Condominium Entire home/apt 6 2.0 2.0 3.0 {TV,"Cable TV",Internet,Wifi,"Air conditioning... $122.00 $240.00 3 30 730 f 1 0 0 109 9 1 115.0 115.0 30.0 730.0 1 Tuesday
78 109 f 1.0 90230 Condominium Entire home/apt 6 2.0 2.0 3.0 {TV,"Cable TV",Internet,Wifi,"Air conditioning... $122.00 $240.00 3 30 730 f 1 0 0 109 9 1 115.0 115.0 30.0 730.0 2 Wednesday
79 109 f 1.0 90230 Condominium Entire home/apt 6 2.0 2.0 3.0 {TV,"Cable TV",Internet,Wifi,"Air conditioning... $122.00 $240.00 3 30 730 f 1 0 0 109 9 1 115.0 115.0 30.0 730.0 3 Thursday
80 109 f 1.0 90230 Condominium Entire home/apt 6 2.0 2.0 3.0 {TV,"Cable TV",Internet,Wifi,"Air conditioning... $122.00 $240.00 3 30 730 f 1 0 0 109 9 1 115.0 115.0 30.0 730.0 4 Friday
81 109 f 1.0 90230 Condominium Entire home/apt 6 2.0 2.0 3.0 {TV,"Cable TV",Internet,Wifi,"Air conditioning... $122.00 $240.00 3 30 730 f 1 0 0 109 9 1 115.0 115.0 30.0 730.0 5 Saturday
82 109 f 1.0 90230 Condominium Entire home/apt 6 2.0 2.0 3.0 {TV,"Cable TV",Internet,Wifi,"Air conditioning... $122.00 $240.00 3 30 730 f 1 0 0 109 9 1 115.0 115.0 30.0 730.0 6 Sunday
83 109 f 1.0 90230 Condominium Entire home/apt 6 2.0 2.0 3.0 {TV,"Cable TV",Internet,Wifi,"Air conditioning... $122.00 $240.00 3 30 730 f 1 0 0 109 9 1 115.0 115.0 30.0 730.0 0 Monday
84 109 f 1.0 90230 Condominium Entire home/apt 6 2.0 2.0 3.0 {TV,"Cable TV",Internet,Wifi,"Air conditioning... $122.00 $240.00 3 30 730 f 1 0 0 109 7 0 131.0 131.0 30.0 730.0 2 Wednesday
85 109 f 1.0 90230 Condominium Entire home/apt 6 2.0 2.0 3.0 {TV,"Cable TV",Internet,Wifi,"Air conditioning... $122.00 $240.00 3 30 730 f 1 0 0 109 7 0 131.0 131.0 30.0 730.0 3 Thursday
86 109 f 1.0 90230 Condominium Entire home/apt 6 2.0 2.0 3.0 {TV,"Cable TV",Internet,Wifi,"Air conditioning... $122.00 $240.00 3 30 730 f 1 0 0 109 7 0 131.0 131.0 30.0 730.0 4 Friday
87 109 f 1.0 90230 Condominium Entire home/apt 6 2.0 2.0 3.0 {TV,"Cable TV",Internet,Wifi,"Air conditioning... $122.00 $240.00 3 30 730 f 1 0 0 109 7 0 131.0 131.0 30.0 730.0 5 Saturday
88 109 f 1.0 90230 Condominium Entire home/apt 6 2.0 2.0 3.0 {TV,"Cable TV",Internet,Wifi,"Air conditioning... $122.00 $240.00 3 30 730 f 1 0 0 109 7 0 131.0 131.0 30.0 730.0 6 Sunday
89 109 f 1.0 90230 Condominium Entire home/apt 6 2.0 2.0 3.0 {TV,"Cable TV",Internet,Wifi,"Air conditioning... $122.00 $240.00 3 30 730 f 1 0 0 109 7 0 131.0 131.0 30.0 730.0 0 Monday
90 109 f 1.0 90230 Condominium Entire home/apt 6 2.0 2.0 3.0 {TV,"Cable TV",Internet,Wifi,"Air conditioning... $122.00 $240.00 3 30 730 f 1 0 0 109 7 0 131.0 131.0 30.0 730.0 1 Tuesday
91 109 f 1.0 90230 Condominium Entire home/apt 6 2.0 2.0 3.0 {TV,"Cable TV",Internet,Wifi,"Air conditioning... $122.00 $240.00 3 30 730 f 1 0 0 109 7 1 115.0 115.0 30.0 730.0 1 Tuesday
92 109 f 1.0 90230 Condominium Entire home/apt 6 2.0 2.0 3.0 {TV,"Cable TV",Internet,Wifi,"Air conditioning... $122.00 $240.00 3 30 730 f 1 0 0 109 7 1 115.0 115.0 30.0 730.0 2 Wednesday
93 109 f 1.0 90230 Condominium Entire home/apt 6 2.0 2.0 3.0 {TV,"Cable TV",Internet,Wifi,"Air conditioning... $122.00 $240.00 3 30 730 f 1 0 0 109 7 1 115.0 115.0 30.0 730.0 3 Thursday
94 109 f 1.0 90230 Condominium Entire home/apt 6 2.0 2.0 3.0 {TV,"Cable TV",Internet,Wifi,"Air conditioning... $122.00 $240.00 3 30 730 f 1 0 0 109 7 1 115.0 115.0 30.0 730.0 4 Friday
95 109 f 1.0 90230 Condominium Entire home/apt 6 2.0 2.0 3.0 {TV,"Cable TV",Internet,Wifi,"Air conditioning... $122.00 $240.00 3 30 730 f 1 0 0 109 7 1 115.0 115.0 30.0 730.0 5 Saturday
96 109 f 1.0 90230 Condominium Entire home/apt 6 2.0 2.0 3.0 {TV,"Cable TV",Internet,Wifi,"Air conditioning... $122.00 $240.00 3 30 730 f 1 0 0 109 7 1 115.0 115.0 30.0 730.0 6 Sunday
97 109 f 1.0 90230 Condominium Entire home/apt 6 2.0 2.0 3.0 {TV,"Cable TV",Internet,Wifi,"Air conditioning... $122.00 $240.00 3 30 730 f 1 0 0 109 7 1 115.0 115.0 30.0 730.0 0 Monday
98 109 f 1.0 90230 Condominium Entire home/apt 6 2.0 2.0 3.0 {TV,"Cable TV",Internet,Wifi,"Air conditioning... $122.00 $240.00 3 30 730 f 1 0 0 109 7 1 115.0 115.0 30.0 730.0 1 Tuesday
99 109 f 1.0 90230 Condominium Entire home/apt 6 2.0 2.0 3.0 {TV,"Cable TV",Internet,Wifi,"Air conditioning... $122.00 $240.00 3 30 730 f 1 0 0 109 7 1 115.0 115.0 30.0 730.0 2 Wednesday
In [ ]:
df1.merge(df2, left_on='lkey', right_on='rkey')
In [ ]:
 
In [279]:
keys_list = list(names.keys())


airbnb_df = names['la_jan']
for i in keys_list[1:]:
    airbnb_df = pd.concat([airbnb_df, names[i]], axis = 0)
In [280]:
airbnb_df.shape
Out[280]:
(606496, 106)
In [281]:
print(os.getcwd())
/Users/sitebai/Desktop/UCI/fall/bana212da/listing_zips
In [282]:
airbnb_df.to_csv('/Users/sitebai/Desktop/UCI/fall/bana212da/listing_zips/airbnb_df_original.csv')

Subset dataset only containing state of "CA, Ca, ca"

In [283]:
airbnb_df['state'].value_counts()
Out[283]:
CA                 605693
Ca                    534
ca                    103
加州                     34
NY                     29
加洲                     14
California             10
IL                      8
NV                      8
AZ                      5
GA                      5
TN                      5
Beverly Hills           4
LA                      2
Ny                      2
FL                      2
Los Angeles, CA         1
Name: state, dtype: int64
In [284]:
airbnb_ca = airbnb_df[airbnb_df['state'].isin(['CA','Ca','ca'])]
In [287]:
print(os.getcwd())
/Users/sitebai/Desktop/UCI/fall/bana212da/listing_zips
In [288]:
airbnb_ca.to_csv("/Users/sitebai/Desktop/UCI/fall/bana212da/listing_zips/airbnb_ca_only.csv")
In [285]:
print("The dataset before subsetting has %s" %airbnb_df.shape[0] + ' rows.')
print("The dateser after subsetting has %s" %airbnb_ca.shape[0] + ' rows.')
The dataset before subsetting has 606496 rows.
The dateser after subsetting has 606330 rows.
In [286]:
airbnb_ca.last_scraped.value_counts()
Out[286]:
2019-09-14    44976
2019-08-08    43001
2019-11-20    40881
2019-10-15    40681
2019-11-15    40154
2019-11-02    37944
2019-04-06    36192
2019-06-05    34326
2019-03-07    33954
2019-07-09    33778
2019-01-12    29177
2019-12-06    28196
2019-02-03    27625
2019-05-06    23140
2019-05-05    20751
2019-02-04    16033
2019-01-11    14153
2019-12-07    11913
2019-07-08    10782
2019-06-04    10143
2019-03-06     9180
2019-04-05     7184
2019-11-01     5065
2019-10-14     3684
2019-08-09     1918
2019-11-16     1478
2019-09-13        9
2019-11-21        5
2019-04-21        4
2019-05-07        2
2019-01-13        1
Name: last_scraped, dtype: int64

Why I did this?

In [ ]:
airbnb_ca = airbnb_ca[~airbnb_ca.last_scraped.isin(['2019-11-15','2019-11-20'])]
In [219]:
airbnb_ca.shape
Out[219]:
(525295, 106)
In [274]:
airbnb_ca.head()
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-274-5f990b96ef75> in <module>
----> 1 airbnb_ca.head()

NameError: name 'airbnb_ca' is not defined
In [290]:
with_long_lat = airbnb_ca[columns_needed]
In [289]:
columns_needed = ['neighbourhood_cleansed','price','property_type','room_type','latitude',
       'longitude']
In [291]:
with_long_lat[['neighbourhood_cleansed','property_type','room_type']].apply(pd.Series.nunique, axis = 'rows')
Out[291]:
neighbourhood_cleansed    264
property_type              47
room_type                   4
dtype: int64
In [369]:
with_long_lat[['neighbourhood_cleansed','property_type','room_type']] = with_long_lat[['neighbourhood_cleansed','property_type','room_type']].astype('category')
/opt/anaconda3/lib/python3.7/site-packages/pandas/core/frame.py:3069: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]
In [370]:
property_type_table =  with_long_lat.property_type.value_counts().to_frame()
In [371]:
property_type_df = property_type_table.reset_index()
In [372]:
property_type_df 
Out[372]:
index property_type
0 Apartment 221812
1 House 207589
2 Condominium 34800
3 Guesthouse 31763
4 Guest suite 20588
5 Townhouse 19000
6 Bungalow 16870
7 Villa 13798
8 Loft 13198
9 Serviced apartment 4281
10 Hostel 4064
11 Cottage 2640
12 Bed and breakfast 2556
13 Camper/RV 2379
14 Boutique hotel 2309
15 Other 1921
16 Tiny house 1135
17 Cabin 1083
18 Aparthotel 963
19 Hotel 723
20 Farm stay 428
21 Tent 325
22 Boat 298
23 Castle 231
24 Earth house 214
25 Chalet 192
26 Treehouse 175
27 Dome house 146
28 Yurt 140
29 Campsite 114
30 Barn 110
31 Tipi 110
32 Dorm 72
33 Hut 52
34 Casa particular (Cuba) 45
35 Bus 39
36 Resort 38
37 Island 30
38 Cave 21
39 Minsu (Taiwan) 21
40 Train 19
41 Vacation home 11
42 Pension (South Korea) 9
43 Igloo 8
44 Nature lodge 4
45 Plane 3
46 Lighthouse 3
In [373]:
with_long_lat_subset= with_long_lat[with_long_lat.property_type
                                       .isin(['Apartment',"House",'Condominium','Guesthouse'])]
In [ ]:
 
In [374]:
with_long_lat_subset.loc[:,"property_type"] = with_long_lat_subset.loc[:,"property_type"].cat.remove_unused_categories()
/opt/anaconda3/lib/python3.7/site-packages/pandas/core/indexing.py:1743: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)
In [375]:
with_long_lat_nonas = with_long_lat_subset.dropna()
In [376]:
with_long_lat_nonas.shape
Out[376]:
(495964, 6)
In [378]:
with_long_lat_nonas.price = with_long_lat_nonas.price.astype(str).str.replace("$","").str.replace(",","").astype(float)
In [379]:
with_long_lat_nonas.head()
Out[379]:
neighbourhood_cleansed price property_type room_type latitude longitude
0 Culver City 122.0 Condominium Entire home/apt 33.982095 -118.384935
1 Burbank 168.0 House Entire home/apt 34.165616 -118.334582
2 Hollywood 79.0 Apartment Private room 34.097676 -118.346023
3 Santa Monica 140.0 Apartment Private room 34.004750 -118.481266
4 Bellflower 80.0 Apartment Entire home/apt 33.876189 -118.113968
In [380]:
print(os.getcwd())
/Users/sitebai/Desktop/UCI/fall/bana212da/listing_zips
In [381]:
with_long_lat_nonas.to_csv("/Users/sitebai/Desktop/UCI/fall/bana212da/listing_zips/with_long_lat_nonas.csv")
In [299]:
print(os.getcwd())
/Users/sitebai/Desktop/UCI/fall/bana212da/listing_zips
In [300]:
property_type_df.to_csv('/Users/sitebai/Desktop/UCI/fall/bana212da/listing_zips/property_type_df.csv')
In [ ]:
 
In [294]:
# some of these features might be having missing values, and some of them might be not releveant to 
#train machine learning model. Therefore, datacleaning is very necessary to prepare the data. 

#The strategy used for cleaning the data is as follows:
#1. Drop columns that are not relevant to our objective. For example: Experiences_offer, URL
#2. Find missing values for each column after being cleaned from step1. 
#3. Question here. what if some columns have so many missing values? which is more than 30% of the entire data?
#4. Covert columns to their correct data type. 
#5. 

Drop Columns

In [220]:
#check how data types are distributed. 
airbnb_ca.dtypes.value_counts()
Out[220]:
object     63
float64    22
int64      21
dtype: int64
In [221]:
airbnb_ca.select_dtypes(include = ['object']).columns
Out[221]:
Index(['listing_url', 'last_scraped', 'name', 'summary', 'space',
       'description', 'experiences_offered', 'neighborhood_overview', 'notes',
       'transit', 'access', 'interaction', 'house_rules', 'picture_url',
       'host_url', 'host_name', 'host_since', 'host_location', 'host_about',
       'host_response_time', 'host_response_rate', 'host_is_superhost',
       'host_thumbnail_url', 'host_picture_url', 'host_neighbourhood',
       'host_verifications', 'host_has_profile_pic', 'host_identity_verified',
       'street', 'neighbourhood', 'neighbourhood_cleansed',
       'neighbourhood_group_cleansed', 'city', 'state', 'zipcode', 'market',
       'smart_location', 'country_code', 'country', 'is_location_exact',
       'property_type', 'room_type', 'bed_type', 'amenities', 'price',
       'weekly_price', 'monthly_price', 'security_deposit', 'cleaning_fee',
       'extra_people', 'calendar_updated', 'has_availability',
       'calendar_last_scraped', 'first_review', 'last_review',
       'requires_license', 'license', 'jurisdiction_names', 'instant_bookable',
       'is_business_travel_ready', 'cancellation_policy',
       'require_guest_profile_picture', 'require_guest_phone_verification'],
      dtype='object')
In [222]:
drop_object_list = ['listing_url','last_scraped','name','summary','description','experiences_offered','picture_url',
                  'host_url','host_name', 'host_since', 'host_location', 'host_about',
                  'host_thumbnail_url', 'host_picture_url', 'host_neighbourhood',
                  'street', 'neighbourhood','city', 'state','neighbourhood_group_cleansed','market','smart_location', 
                   'country_code', 'country','calendar_updated','calendar_last_scraped', 
                   'first_review', 'last_review',"jurisdiction_names",'is_business_travel_ready',
                   'interaction','access','transit','house_rules','cancellation_policy',
                  'host_has_profile_pic','host_verifications','require_guest_profile_picture', 
                    'require_guest_phone_verification','bed_type','security_deposit','requires_license',
                    'host_has_profile_pic',"host_identity_verified","extra_people",
                    'has_availability','is_location_exact','neighborhood_overview','zipcode','instant_bookable'
                   ]
In [ ]:
## export this drop object list as csv and send it to DD. 

Drop irrelevant float64 types

In [223]:
airbnb_ca.select_dtypes(include=["float64"]).columns
Out[223]:
Index(['thumbnail_url', 'medium_url', 'xl_picture_url', 'host_acceptance_rate',
       'host_listings_count', 'host_total_listings_count', 'latitude',
       'longitude', 'bathrooms', 'bedrooms', 'beds', 'square_feet',
       'minimum_nights_avg_ntm', 'maximum_nights_avg_ntm',
       'review_scores_rating', 'review_scores_accuracy',
       'review_scores_cleanliness', 'review_scores_checkin',
       'review_scores_communication', 'review_scores_location',
       'review_scores_value', 'reviews_per_month'],
      dtype='object')
In [ ]:
 
In [224]:
#The reason that columns of latitude and longitude is not being dropped is they may be helpful when creating 
# visualizations. 


drop_float_list = ['thumbnail_url', 'medium_url', 'xl_picture_url',
                  'minimum_nights_avg_ntm', 'maximum_nights_avg_ntm',
                   'host_listings_count','latitude',
       'longitude','host_total_listings_count'
                  ]

Drop irrelevant int64 types

In [55]:
#does availability_30 and etc mean how many days the listing would be available 
#within a year?

# calculated_host_listings_count. Do we need to keep this column? one-hot encoding is ready. 
In [56]:
# what d
In [225]:
airbnb_ca[['guests_included','room_type', 'calculated_host_listings_count_entire_homes',
          'calculated_host_listings_count_private_rooms','calculated_host_listings_count_shared_rooms']].head()
Out[225]:
guests_included room_type calculated_host_listings_count_entire_homes calculated_host_listings_count_private_rooms calculated_host_listings_count_shared_rooms
0 3 Entire home/apt 1 0 0
1 6 Entire home/apt 1 0 0
2 1 Private room 0 2 0
3 1 Private room 1 1 0
4 1 Entire home/apt 1 0 0
In [226]:
airbnb_ca.select_dtypes(include=["int64"]).columns
Out[226]:
Index(['id', 'scrape_id', 'host_id', 'accommodates', 'guests_included',
       'minimum_nights', 'maximum_nights', 'minimum_minimum_nights',
       'maximum_minimum_nights', 'minimum_maximum_nights',
       'maximum_maximum_nights', 'availability_30', 'availability_60',
       'availability_90', 'availability_365', 'number_of_reviews',
       'number_of_reviews_ltm', 'calculated_host_listings_count',
       'calculated_host_listings_count_entire_homes',
       'calculated_host_listings_count_private_rooms',
       'calculated_host_listings_count_shared_rooms'],
      dtype='object')
In [227]:
drop_int_list = ['host_id','scrape_id','minimum_minimum_nights','maximum_minimum_nights', 
                 'minimum_maximum_nights','maximum_maximum_nights','calculated_host_listings_count',
                "number_of_reviews","number_of_reviews_ltm",'availability_30', 'availability_60',
       'availability_90', 'availability_365','calculated_host_listings_count',
       'calculated_host_listings_count_entire_homes',
       'calculated_host_listings_count_private_rooms',
       'calculated_host_listings_count_shared_rooms','maximum_nights','guests_included']
In [290]:
#airbnb_ca[['calculated_host_listings_count','calculated_host_listings_count_entire_homes',
           #'calculated_host_listings_count_private_rooms','calculated_host_listings_count_shared_rooms']].head()
In [228]:
#combine all of the columns which are supposed to be dropped. 
drop_columns = drop_object_list + drop_float_list + drop_int_list
print(drop_columns)
['listing_url', 'last_scraped', 'name', 'summary', 'description', 'experiences_offered', 'picture_url', 'host_url', 'host_name', 'host_since', 'host_location', 'host_about', 'host_thumbnail_url', 'host_picture_url', 'host_neighbourhood', 'street', 'neighbourhood', 'city', 'state', 'neighbourhood_group_cleansed', 'market', 'smart_location', 'country_code', 'country', 'calendar_updated', 'calendar_last_scraped', 'first_review', 'last_review', 'jurisdiction_names', 'is_business_travel_ready', 'interaction', 'access', 'transit', 'house_rules', 'cancellation_policy', 'host_has_profile_pic', 'host_verifications', 'require_guest_profile_picture', 'require_guest_phone_verification', 'bed_type', 'security_deposit', 'requires_license', 'host_has_profile_pic', 'host_identity_verified', 'extra_people', 'has_availability', 'is_location_exact', 'neighborhood_overview', 'zipcode', 'instant_bookable', 'thumbnail_url', 'medium_url', 'xl_picture_url', 'minimum_nights_avg_ntm', 'maximum_nights_avg_ntm', 'host_listings_count', 'latitude', 'longitude', 'host_total_listings_count', 'host_id', 'scrape_id', 'minimum_minimum_nights', 'maximum_minimum_nights', 'minimum_maximum_nights', 'maximum_maximum_nights', 'calculated_host_listings_count', 'number_of_reviews', 'number_of_reviews_ltm', 'availability_30', 'availability_60', 'availability_90', 'availability_365', 'calculated_host_listings_count', 'calculated_host_listings_count_entire_homes', 'calculated_host_listings_count_private_rooms', 'calculated_host_listings_count_shared_rooms', 'maximum_nights', 'guests_included']
In [229]:
print("Dropping %s columns" %len(drop_columns))
airbnb_ca_cleaned = airbnb_ca.drop(columns = drop_columns)
print("Shape fo the dataset after dropping : " , airbnb_ca_cleaned.shape)
Dropping 78 columns
Shape fo the dataset after dropping :  (525295, 30)

missing value percentage for each column statistics

In [230]:
missing_stats = airbnb_ca_cleaned.isnull().sum().to_frame()
In [231]:
missing_stats.shape
Out[231]:
(30, 1)
In [232]:
missing_stats.columns=["missing_num"]
In [233]:
missing_stats["missing_pctg"] = round(100* missing_stats["missing_num"] / airbnb_ca_cleaned.shape[0])
In [234]:
missing_stats.sort_values(by = "missing_num", ascending  =False, inplace = True)
In [235]:
display(missing_stats)
missing_num missing_pctg
host_acceptance_rate 525295 100.0
square_feet 521229 99.0
license 495267 94.0
monthly_price 470712 90.0
weekly_price 465873 89.0
notes 260606 50.0
space 136354 26.0
host_response_time 111340 21.0
host_response_rate 111340 21.0
review_scores_value 109166 21.0
review_scores_location 109071 21.0
review_scores_checkin 109006 21.0
review_scores_communication 108724 21.0
review_scores_accuracy 108675 21.0
review_scores_cleanliness 108668 21.0
review_scores_rating 108454 21.0
reviews_per_month 101691 19.0
cleaning_fee 71236 14.0
beds 638 0.0
host_is_superhost 587 0.0
bedrooms 412 0.0
bathrooms 292 0.0
price 0 0.0
amenities 0 0.0
minimum_nights 0 0.0
accommodates 0 0.0
room_type 0 0.0
property_type 0 0.0
neighbourhood_cleansed 0 0.0
id 0 0.0
In [236]:
drop_missing_values = list(missing_stats[missing_stats['missing_pctg'] >= 19].index)
print(drop_missing_values)
['host_acceptance_rate', 'square_feet', 'license', 'monthly_price', 'weekly_price', 'notes', 'space', 'host_response_time', 'host_response_rate', 'review_scores_value', 'review_scores_location', 'review_scores_checkin', 'review_scores_communication', 'review_scores_accuracy', 'review_scores_cleanliness', 'review_scores_rating', 'reviews_per_month']
In [237]:
airbnb_partial_cleaned = airbnb_ca_cleaned.drop(columns= drop_missing_values)
In [238]:
airbnb_partial_cleaned.shape
Out[238]:
(525295, 13)

Convert data type

1.some columns with object type should be changed to categorical type so as to run one-hot encoding when training the model

In [239]:
airbnb_partial_cleaned['price'] = airbnb_partial_cleaned['price'].astype(str).str.replace("$","").str.replace(",","").astype(float)

airbnb_partial_cleaned['cleaning_fee'] = airbnb_partial_cleaned['cleaning_fee'].astype(str).str.replace("$","").str.replace(",","").astype(float)
In [240]:
airbnb_partial_cleaned.columns
Out[240]:
Index(['id', 'host_is_superhost', 'neighbourhood_cleansed', 'property_type',
       'room_type', 'accommodates', 'bathrooms', 'bedrooms', 'beds',
       'amenities', 'price', 'cleaning_fee', 'minimum_nights'],
      dtype='object')
In [241]:
# which columns should be changed to categorical?

airbnb_partial_cleaned.select_dtypes(include = "object").head()
Out[241]:
host_is_superhost neighbourhood_cleansed property_type room_type amenities
0 f Culver City Condominium Entire home/apt {TV,"Cable TV",Internet,Wifi,"Air conditioning...
1 f Burbank House Entire home/apt {TV,"Cable TV",Internet,Wifi,"Air conditioning...
2 t Hollywood Apartment Private room {Internet,Wifi,"Air conditioning","Wheelchair ...
3 f Santa Monica Apartment Private room {Internet,Wifi,Kitchen,Heating,Washer,Dryer,"S...
4 f Bellflower Apartment Entire home/apt {TV,"Cable TV",Internet,Wifi,"Air conditioning...
In [242]:
airbnb_partial_cleaned.select_dtypes(include = 'object').columns
Out[242]:
Index(['host_is_superhost', 'neighbourhood_cleansed', 'property_type',
       'room_type', 'amenities'],
      dtype='object')
In [162]:
categorical_cols = ['host_is_superhost', 'property_type','room_type', 
                     'neighbourhood_cleansed'                
]
In [244]:
airbnb_partial_cleaned[categorical_cols].dtypes
Out[244]:
host_is_superhost         object
property_type             object
room_type                 object
neighbourhood_cleansed    object
dtype: object
In [245]:
airbnb_partial_cleaned[categorical_cols] = airbnb_partial_cleaned[categorical_cols].apply(lambda x : x.astype('category'),
                                                                         axis = 'rows')
In [246]:
airbnb_partial_cleaned[categorical_cols].dtypes
Out[246]:
host_is_superhost         category
property_type             category
room_type                 category
neighbourhood_cleansed    category
dtype: object
In [247]:
airbnb_partial_cleaned.host_is_superhost = airbnb_partial_cleaned.host_is_superhost.map(dict(f=0,t=1))
In [ ]:
airbnb_partial_cleaned.host_is_superhost = airbnb_partial_cleaned.host_is_superhost.map(dict(f=0,t=1))
In [405]:
airbnb_partial_cleaned.shape
Out[405]:
(525295, 13)

2. Filter data

1.Find out how many unqiue values each of these columns have.

In [248]:
num_unique_values  = airbnb_partial_cleaned[categorical_cols].apply(pd.Series.nunique, axis = 'rows')
num_unique_values
Out[248]:
host_is_superhost           2
property_type              47
room_type                   4
neighbourhood_cleansed    264
dtype: int64
In [204]:
num_unique_values.plot(kind = 'bar')
plt.xlabel('labels')
plt.ylabel('Number of unique values')
plt.show()
In [106]:
#airbnb_partial_cleaned.to_csv("partial_cleaned_listings_without_subsetting.csv", index = False)
In [249]:
airbnb_subset = airbnb_partial_cleaned[airbnb_partial_cleaned.property_type
                                       .isin(['Apartment',"House",'Condominium','Guesthouse'])]
In [ ]:
 
In [250]:
airbnb_subset.shape
Out[250]:
(430014, 13)

2. Drop those unused categories of property types.

In [251]:
airbnb_subset.loc[:,"property_type"] = airbnb_subset.loc[:,"property_type"].cat.remove_unused_categories()
/opt/anaconda3/lib/python3.7/site-packages/pandas/core/indexing.py:1743: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)
In [252]:
airbnb_subset['property_type'].value_counts()
Out[252]:
Apartment      193271
House          179303
Condominium     30028
Guesthouse      27412
Name: property_type, dtype: int64

3. Convert amenities to the number of elements included in amenities for each observation.

In [253]:
airbnb_subset['amenities'] = airbnb_subset['amenities'].apply(lambda x : len(x.split(",")))
/opt/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.

4. Missing Values Statistics for the subset

In [254]:
subset_missing = airbnb_subset.isnull().sum().to_frame()
subset_missing.columns = ["missing_values"]
subset_missing['percentage'] =(100 * subset_missing['missing_values'] / airbnb_subset.shape[0])
In [255]:
subset_missing
Out[255]:
missing_values percentage
id 0 0.000000
host_is_superhost 494 0.114880
neighbourhood_cleansed 0 0.000000
property_type 0 0.000000
room_type 0 0.000000
accommodates 0 0.000000
bathrooms 251 0.058370
bedrooms 380 0.088369
beds 404 0.093950
amenities 0 0.000000
price 0 0.000000
cleaning_fee 58967 13.712809
minimum_nights 0 0.000000

Check if columns contain host are having missing values for same observations.

In [212]:
cols_missing_values = list(subset_missing[subset_missing.missing_values > 0].index)
airbnb_subset_missing_values = airbnb_subset[cols_missing_values]
In [213]:
host_cols = list(airbnb_subset_missing_values.columns[airbnb_subset_missing_values.columns.str.contains('host')])
In [203]:
host_cols
Out[203]:
['host_is_superhost', 'host_total_listings_count']
In [204]:
airbnb_subset_missing_values[host_cols][airbnb_subset_missing_values.host_is_superhost.isnull()].shape
Out[204]:
(523, 2)

The result above proves that those three host_cols have same missing values for the same observations.

Delete all of those missing values and to see how many observations left.

In [256]:
airbnb_nomissing = airbnb_subset.dropna()
In [257]:
airbnb_nomissing.head()
Out[257]:
id host_is_superhost neighbourhood_cleansed property_type room_type accommodates bathrooms bedrooms beds amenities price cleaning_fee minimum_nights
0 109 0 Culver City Condominium Entire home/apt 6 2.0 2.0 3.0 32 122.0 240.0 7
1 344 0 Burbank House Entire home/apt 6 1.0 3.0 3.0 41 168.0 100.0 2
2 2708 1 Hollywood Apartment Private room 1 1.5 1.0 1.0 43 79.0 85.0 6
3 2732 0 Santa Monica Apartment Private room 1 1.0 1.0 1.0 12 140.0 100.0 1
4 2864 0 Bellflower Apartment Entire home/apt 2 1.0 1.0 1.0 20 80.0 75.0 2
In [258]:
airbnb_nomissing.columns
Out[258]:
Index(['id', 'host_is_superhost', 'neighbourhood_cleansed', 'property_type',
       'room_type', 'accommodates', 'bathrooms', 'bedrooms', 'beds',
       'amenities', 'price', 'cleaning_fee', 'minimum_nights'],
      dtype='object')
In [259]:
print("After dropping all of missing values for each column, the dataset named airbnb_nomissing \n has %s" % airbnb_nomissing.shape[0] 
      + " rows."
     )
After dropping all of missing values for each column, the dataset named airbnb_nomissing 
 has 370236 rows.
In [ ]:
a

3. One hot encoding categorical variables.

Keep latitude and longitude columns for creating visualizations. So assign a new dataset without latitude and longitude to airbnb_training.

In [410]:
#airbnb_training = airbnb_nomissing.drop(['latitude','longitude'], axis = 'columns')
In [51]:
#if "latitude" in airbnb_training.columns:
#    print('yes')
#else:
 #   print('no')
In [260]:
airbnb_la = airbnb_nomissing
In [92]:
print(os.getcwd())
/Users/sitebai/Desktop/UCI/fall/bana212da/listing_zips
In [262]:
airbnb_la.to_csv(r'/Users/sitebai/Desktop/UCI/fall/bana212da/listing_zips/airbnb_la.csv', index = False)
In [149]:
print(os.getcwd())
/Users/sitebai/Desktop/UCI/fall/bana212da/data_prep
In [21]:
airbnb_la =  pd.read_csv("/Users/sitebai/Desktop/UCI/fall/bana212da/listing_zips/airbnb_la.csv")
In [22]:
categorical_cols = ['host_is_superhost', 'property_type','room_type', 
                     'neighbourhood_cleansed'                
]
In [20]:
airbnb_la.dtypes
Out[20]:
id                          int64
host_is_superhost           int64
neighbourhood_cleansed     object
property_type              object
room_type                  object
accommodates                int64
bathrooms                 float64
bedrooms                  float64
beds                      float64
amenities                   int64
price                     float64
cleaning_fee              float64
minimum_nights              int64
dtype: object
In [231]:
airbnb_la.property_type.value_counts()
Out[231]:
Apartment      165714
House          152971
Condominium     26074
Guesthouse      25477
Name: property_type, dtype: int64
In [235]:
airbnb_la[airbnb_la['room_type'] == 'Hotel room']
Out[235]:
id host_is_superhost neighbourhood_cleansed property_type room_type accommodates bathrooms bedrooms beds amenities price cleaning_fee minimum_nights
277461 36761030 0 Koreatown House Hotel room 16 1.5 1.0 10.0 20 25.0 10.0 1
277507 36789766 0 Koreatown House Hotel room 16 8.5 1.0 0.0 20 28.0 10.0 1
In [237]:
airbnb_la_no_hotel =airbnb_la.drop([277461,277507])
In [234]:
airbnb_la.room_type.head()
Out[234]:
0    Entire home/apt
1    Entire home/apt
2       Private room
3       Private room
4    Entire home/apt
Name: room_type, dtype: category
Categories (4, object): ['Entire home/apt', 'Hotel room', 'Private room', 'Shared room']
In [256]:
categorical_cols
Out[256]:
['host_is_superhost', 'property_type', 'room_type', 'neighbourhood_cleansed']
In [257]:
airbnb_la_no_hotel[categorical_cols] = airbnb_la_no_hotel[categorical_cols].astype('category')
In [260]:
no_hotel_onehot = pd.get_dummies(airbnb_la_no_hotel[categorical_cols])
In [262]:
airbnb_la_no_hotel.dtypes
Out[262]:
id                           int64
host_is_superhost         category
neighbourhood_cleansed    category
property_type             category
room_type                 category
accommodates                 int64
bathrooms                  float64
bedrooms                   float64
beds                       float64
amenities                    int64
price                      float64
cleaning_fee               float64
minimum_nights               int64
dtype: object
In [240]:
no_hotel_onehot.shape
Out[240]:
(370234, 272)
In [263]:
airbnb_nohotel_dummies = pd.concat([no_hotel_onehot, airbnb_la_no_hotel], axis = 'columns').drop(categorical_cols,
                                                                                           axis = 'columns')
In [266]:
airbnb_nohotel_dummies = airbnb_nohotel_dummies.drop(['id',axis = 1)
In [265]:
airbnb_nohotel_dummies.head()
Out[265]:
host_is_superhost_0 host_is_superhost_1 property_type_Apartment property_type_Condominium property_type_Guesthouse property_type_House room_type_Entire home/apt room_type_Hotel room room_type_Private room room_type_Shared room ... neighbourhood_cleansed_Winnetka neighbourhood_cleansed_Woodland Hills id accommodates bathrooms bedrooms beds amenities cleaning_fee minimum_nights
0 1 0 0 1 0 0 1 0 0 0 ... 0 0 109 6 2.0 2.0 3.0 32 240.0 7
1 1 0 0 0 0 1 1 0 0 0 ... 0 0 344 6 1.0 3.0 3.0 41 100.0 2
2 0 1 1 0 0 0 0 0 1 0 ... 0 0 2708 1 1.5 1.0 1.0 43 85.0 6
3 1 0 1 0 0 0 0 0 1 0 ... 0 0 2732 1 1.0 1.0 1.0 12 100.0 1
4 1 0 1 0 0 0 1 0 0 0 ... 0 0 2864 2 1.0 1.0 1.0 20 75.0 2

5 rows × 280 columns

In [267]:
no_hotel_price = airbnb_la_no_hotel['price']
In [268]:
no_hotel_ols = stat.OLS(no_hotel_price, airbnb_nohotel_dummies)
no_hotel_ols_result = no_hotel_ols.fit()
In [269]:
print(no_hotel_ols_result.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                  price   R-squared:                       0.426
Model:                            OLS   Adj. R-squared:                  0.426
Method:                 Least Squares   F-statistic:                     1002.
Date:                Sun, 06 Dec 2020   Prob (F-statistic):               0.00
Time:                        16:22:09   Log-Likelihood:            -2.6087e+06
No. Observations:              370234   AIC:                         5.218e+06
Df Residuals:                  369959   BIC:                         5.221e+06
Df Model:                         274                                         
Covariance Type:            nonrobust                                         
================================================================================================================================
                                                                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------------------------------------------------
host_is_superhost_0                                          -1.621e+12   1.51e+12     -1.071      0.284   -4.59e+12    1.35e+12
host_is_superhost_1                                          -1.621e+12   1.51e+12     -1.071      0.284   -4.59e+12    1.35e+12
property_type_Apartment                                        5.28e+11   4.93e+11      1.071      0.284   -4.39e+11    1.49e+12
property_type_Condominium                                      5.28e+11   4.93e+11      1.071      0.284   -4.39e+11    1.49e+12
property_type_Guesthouse                                       5.28e+11   4.93e+11      1.071      0.284   -4.39e+11    1.49e+12
property_type_House                                            5.28e+11   4.93e+11      1.071      0.284   -4.39e+11    1.49e+12
room_type_Entire home/apt                                     1.111e+12   1.04e+12      1.071      0.284   -9.23e+11    3.15e+12
room_type_Hotel room                                         -2.232e+10   2.09e+10     -1.071      0.284   -6.32e+10    1.85e+10
room_type_Private room                                        1.111e+12   1.04e+12      1.071      0.284   -9.23e+11    3.15e+12
room_type_Shared room                                         1.111e+12   1.04e+12      1.071      0.284   -9.23e+11    3.15e+12
neighbourhood_cleansed_Acton                                 -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Adams-Normandie                       -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Agoura Hills                          -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Agua Dulce                            -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Alhambra                              -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Alondra Park                          -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Altadena                              -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Angeles Crest                         -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Arcadia                               -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Arleta                                -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Arlington Heights                     -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Artesia                               -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Athens                                -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Atwater Village                       -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Avalon                                -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Avocado Heights                       -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Azusa                                 -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Baldwin Hills/Crenshaw                -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Baldwin Park                          -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Bel-Air                               -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Bell                                  -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Bell Gardens                          -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Bellflower                            -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Beverly Crest                         -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Beverly Grove                         -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Beverly Hills                         -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Beverlywood                           -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Boyle Heights                         -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Bradbury                              -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Brentwood                             -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Broadway-Manchester                   -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Burbank                               -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Calabasas                             -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Canoga Park                           -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Carson                                -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Carthay                               -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Castaic                               -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Castaic Canyons                       -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Central-Alameda                       -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Century City                          -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Cerritos                              -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Charter Oak                           -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Chatsworth                            -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Chatsworth Reservoir                  -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Chesterfield Square                   -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Cheviot Hills                         -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Chinatown                             -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Citrus                                -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Claremont                             -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Commerce                              -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Compton                               -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Covina                                -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Cudahy                                -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Culver City                           -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Cypress Park                          -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Del Aire                              -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Del Rey                               -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Diamond Bar                           -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Downey                                -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Downtown                              -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Duarte                                -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Eagle Rock                            -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_East Hollywood                        -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_East Los Angeles                      -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_East Pasadena                         -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_East San Gabriel                      -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_East Whittier                         -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Echo Park                             -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_El Monte                              -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_El Segundo                            -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_El Sereno                             -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Elysian Park                          -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Elysian Valley                        -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Encino                                -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Exposition Park                       -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Fairfax                               -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Florence                              -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Florence-Firestone                    -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Gardena                               -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Glassell Park                         -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Glendale                              -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Glendora                              -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Gramercy Park                         -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Granada Hills                         -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Green Meadows                         -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Green Valley                          -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Griffith Park                         -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Hacienda Heights                      -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Hancock Park                          -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Harbor City                           -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Harbor Gateway                        -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Harvard Heights                       -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Harvard Park                          -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Hasley Canyon                         -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Hawaiian Gardens                      -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Hawthorne                             -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Hermosa Beach                         -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Highland Park                         -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Historic South-Central                -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Hollywood                             -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Hollywood Hills                       -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Hollywood Hills West                  -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Huntington Park                       -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Hyde Park                             -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Industry                              -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Inglewood                             -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Irwindale                             -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Jefferson Park                        -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Koreatown                             -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_La Canada Flintridge                  -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_La Crescenta-Montrose                 -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_La Habra Heights                      -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_La Mirada                             -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_La Puente                             -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_La Verne                              -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Ladera Heights                        -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Lake Balboa                           -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Lake Hughes                           -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Lake Los Angeles                      -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Lake View Terrace                     -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Lakewood                              -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Lancaster                             -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Larchmont                             -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Lawndale                              -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Leimert Park                          -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Lennox                                -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Leona Valley                          -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Lincoln Heights                       -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Lomita                                -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Long Beach                            -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Lopez/Kagel Canyons                   -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Los Feliz                             -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Lynwood                               -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Malibu                                -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Manchester Square                     -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Manhattan Beach                       -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Mar Vista                             -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Marina del Rey                        -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Mayflower Village                     -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Maywood                               -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Mid-City                              -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Mid-Wilshire                          -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Mission Hills                         -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Monrovia                              -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Montebello                            -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Montecito Heights                     -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Monterey Park                         -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Mount Washington                      -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_North El Monte                        -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_North Hills                           -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_North Hollywood                       -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_North Whittier                        -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Northeast Antelope Valley             -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Northridge                            -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Northwest Antelope Valley             -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Northwest Palmdale                    -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Norwalk                               -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Pacific Palisades                     -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Pacoima                               -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Palmdale                              -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Palms                                 -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Palos Verdes Estates                  -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Panorama City                         -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Paramount                             -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Pasadena                              -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Pico Rivera                           -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Pico-Robertson                        -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Pico-Union                            -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Playa Vista                           -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Playa del Rey                         -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Pomona                                -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Porter Ranch                          -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Quartz Hill                           -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Rancho Dominguez                      -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Rancho Palos Verdes                   -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Rancho Park                           -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Redondo Beach                         -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Reseda                                -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Ridge Route                           -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Rolling Hills                         -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Rolling Hills Estates                 -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Rosemead                              -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Rowland Heights                       -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_San Dimas                             -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_San Fernando                          -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_San Gabriel                           -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_San Marino                            -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_San Pasqual                           -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_San Pedro                             -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Santa Clarita                         -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Santa Fe Springs                      -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Santa Monica                          -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Sawtelle                              -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Sepulveda Basin                       -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Shadow Hills                          -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Sherman Oaks                          -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Sierra Madre                          -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Signal Hill                           -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Silver Lake                           -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_South Diamond Bar                     -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_South El Monte                        -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_South Gate                            -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_South Park                            -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_South Pasadena                        -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_South San Gabriel                     -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_South San Jose Hills                  -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_South Whittier                        -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Southeast Antelope Valley             -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Stevenson Ranch                       -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Studio City                           -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Sun Valley                            -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Sun Village                           -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Sunland                               -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Sylmar                                -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Tarzana                               -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Temple City                           -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Toluca Lake                           -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Topanga                               -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Torrance                              -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Tujunga                               -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Tujunga Canyons                       -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Unincorporated Catalina Island        -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Unincorporated Santa Monica Mountains -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Unincorporated Santa Susana Mountains -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Universal City                        -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_University Park                       -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Val Verde                             -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Valinda                               -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Valley Glen                           -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Valley Village                        -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Van Nuys                              -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Venice                                -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Vermont Knolls                        -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Vermont Square                        -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Vermont Vista                         -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Vermont-Slauson                       -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Vernon                                -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Veterans Administration               -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_View Park-Windsor Hills               -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Vincent                               -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Walnut                                -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Watts                                 -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_West Adams                            -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_West Carson                           -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_West Compton                          -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_West Covina                           -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_West Hills                            -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_West Hollywood                        -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_West Los Angeles                      -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_West Puente Valley                    -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_West Whittier-Los Nietos              -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Westchester                           -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Westlake                              -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Westlake Village                      -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Westmont                              -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Westwood                              -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Whittier                              -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Willowbrook                           -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Wilmington                            -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Windsor Square                        -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Winnetka                              -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
neighbourhood_cleansed_Woodland Hills                        -1.822e+10    1.7e+10     -1.071      0.284   -5.16e+10    1.51e+10
accommodates                                                     5.2554      0.366     14.363      0.000       4.538       5.973
bathrooms                                                      138.7907      0.886    156.562      0.000     137.053     140.528
bedrooms                                                        17.5189      0.858     20.412      0.000      15.837      19.201
beds                                                           -13.0639      0.499    -26.198      0.000     -14.041     -12.087
amenities                                                       -0.6855      0.042    -16.472      0.000      -0.767      -0.604
cleaning_fee                                                     1.1991      0.008    152.837      0.000       1.184       1.214
minimum_nights                                                  -0.4987      0.022    -22.186      0.000      -0.543      -0.455
==============================================================================
Omnibus:                   781777.748   Durbin-Watson:                   1.881
Prob(Omnibus):                  0.000   Jarque-Bera (JB):       5334194299.280
Skew:                          18.324   Prob(JB):                         0.00
Kurtosis:                     589.890   Cond. No.                     1.90e+16
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The smallest eigenvalue is 1.61e-23. This might indicate that there are
strong multicollinearity problems or that the design matrix is singular.
In [23]:
airbnb_la[categorical_cols] = airbnb_la[categorical_cols].astype('category')
In [24]:
category_one_hot = pd.get_dummies(airbnb_la[categorical_cols])
In [25]:
category_one_hot.shape
Out[25]:
(370236, 272)
In [165]:
category_one_hot.head()
Out[165]:
host_is_superhost property_type_Apartment property_type_Condominium property_type_Guesthouse property_type_House room_type_Entire home/apt room_type_Hotel room room_type_Private room room_type_Shared room neighbourhood_cleansed_Acton ... neighbourhood_cleansed_Westlake neighbourhood_cleansed_Westlake Village neighbourhood_cleansed_Westmont neighbourhood_cleansed_Westwood neighbourhood_cleansed_Whittier neighbourhood_cleansed_Willowbrook neighbourhood_cleansed_Wilmington neighbourhood_cleansed_Windsor Square neighbourhood_cleansed_Winnetka neighbourhood_cleansed_Woodland Hills
0 0 0 1 0 0 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1 0 0 0 0 1 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
2 1 1 0 0 0 0 0 1 0 0 ... 0 0 0 0 0 0 0 0 0 0
3 0 1 0 0 0 0 0 1 0 0 ... 0 0 0 0 0 0 0 0 0 0
4 0 1 0 0 0 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

5 rows × 271 columns

In [26]:
airbnb_with_dummies = pd.concat([category_one_hot, airbnb_la], axis = 'columns').drop(categorical_cols,
                                                                                           axis = 'columns')
In [27]:
airbnb_with_dummies.shape
Out[27]:
(370236, 281)
In [28]:
print(os.getcwd())
/Users/sitebai/Desktop/UCI/fall/bana212da/listing_zips

Up to this point, the data is ready to train the model.

In [29]:
airbnb_with_dummies.to_csv("/Users/sitebai/Desktop/UCI/fall/bana212da/listing_zips/airbnb_with_dummies.csv", index = False)
In [363]:
airbnb_with_dummies.head()
Out[363]:
host_is_superhost_0 host_is_superhost_1 property_type_Apartment property_type_Condominium property_type_Guesthouse property_type_House room_type_Entire home/apt room_type_Hotel room room_type_Private room room_type_Shared room ... neighbourhood_cleansed_Woodland Hills id accommodates bathrooms bedrooms beds amenities price cleaning_fee minimum_nights
0 1 0 0 1 0 0 1 0 0 0 ... 0 109 6 2.0 2.0 3.0 32 122.0 240.0 7
1 1 0 0 0 0 1 1 0 0 0 ... 0 344 6 1.0 3.0 3.0 41 168.0 100.0 2
2 0 1 1 0 0 0 0 0 1 0 ... 0 2708 1 1.5 1.0 1.0 43 79.0 85.0 6
3 1 0 1 0 0 0 0 0 1 0 ... 0 2732 1 1.0 1.0 1.0 12 140.0 100.0 1
4 1 0 1 0 0 0 1 0 0 0 ... 0 2864 2 1.0 1.0 1.0 20 80.0 75.0 2

5 rows × 281 columns

In [ ]:
### adding month columns 
###correlation matrix 
### atrributes affect the price. 
In [178]:
len(airbnb_la.columns)
Out[178]:
13
In [404]:
airbnb_la.columns
Out[404]:
Index(['id', 'host_is_superhost', 'neighbourhood_cleansed', 'property_type',
       'room_type', 'accommodates', 'bathrooms', 'bedrooms', 'beds',
       'amenities', 'price', 'cleaning_fee', 'minimum_nights'],
      dtype='object')
In [229]:
airbnb_la['room_type'].value_counts()
Out[229]:
Entire home/apt    247856
Private room       110943
Shared room         11435
Hotel room              2
Name: room_type, dtype: int64
In [508]:
airbnb_la.groupby('room_type').agg({'price':mean})
Out[508]:
price
room_type
Entire home/apt 241.724279
Hotel room 26.500000
Private room 78.101881
Shared room 51.922868
In [303]:
airbnb_corr = airbnb_la.drop(['id','price'],axis =1)
In [180]:
import seaborn as sns
In [304]:
corr = airbnb_corr.corr()
In [318]:
ax = sns.heatmap(
    corr, 
    vmin=-1, vmax=1, center=0,
    cmap=sns.diverging_palette(20, 220, n=200),
    square=True
)
ax.set_xticklabels(
    ax.get_xticklabels(),
    rotation=45,
    horizontalalignment='right'
);
In [ ]:
, mask=np.zeros_like(corr, dtype=np.bool)
sns.heatmap(corr, annot = True, cmap=sns.diverging_palette(230, 20, as_cmap=True),
            square=True, ax=ax)
In [311]:
sns.heatmap(corr, annot=True)
plot.show()
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-311-3dbc78861643> in <module>
      1 sns.heatmap(corr, annot=True)
----> 2 plot.show()

NameError: name 'plot' is not defined
In [337]:
print(os.getcwd())
/Users/sitebai/Desktop/UCI/fall/bana212da/listing_zips
In [367]:
f, ax = plt.subplots(figsize=(9, 8))
corr = airbnb_corr.corr()

sns.heatmap(corr, annot = True, annot_kws={"size":13}, fmt=".2f",cmap=sns.diverging_palette(220, 10, as_cmap=True))
ax.set_ylim(np.array([-0.5, -.5])+ax.get_ylim())
plt.xticks(fontsize=13, rotation=45)
plt.yticks(fontsize=13)
plt.savefig('/Users/sitebai/Desktop/UCI/fall/bana212da/listing_zips/correlation_matrix2.jpeg',bbox_inches='tight')
In [321]:
sns.heatmap(airbnb_corr.corr(),mask=np.zeros_like(corr, dtype=np.bool), cmap=sns.diverging_palette(220, 10, as_cmap=True),
            square=True, ax=ax)
Out[321]:
<matplotlib.axes._subplots.AxesSubplot at 0x7ffc115efa50>
In [ ]:
 
In [17]:
airbnb_la = pd.read_csv("airbnb_la.csv")
In [68]:
# Concatenate two tables together. 
airbnb_with_dummies = pd.concat([category_one_hot, airbnb_la], axis = 'columns').drop(categorical_cols,
                                                                                           axis = 'columns')
In [499]:
airbnb_la.to_csv(r'/Users/sitebai/Desktop/UCI/fall/bana212da/listing_zips/airbnb_la.csv', index = False)
In [497]:
print(os.getcwd())
/Users/sitebai/Desktop/UCI/fall/bana212da/listing_zips
In [69]:
airbnb_with_dummies.shape
Out[69]:
(370236, 281)
In [179]:
airbnb_with_dummies.columns
Out[179]:
Index(['host_is_superhost_0', 'host_is_superhost_1', 'property_type_Apartment',
       'property_type_Condominium', 'property_type_Guesthouse',
       'property_type_House', 'room_type_Entire home/apt',
       'room_type_Hotel room', 'room_type_Private room',
       'room_type_Shared room',
       ...
       'neighbourhood_cleansed_Woodland Hills', 'id', 'accommodates',
       'bathrooms', 'bedrooms', 'beds', 'amenities', 'price', 'cleaning_fee',
       'minimum_nights'],
      dtype='object', length=281)

4. Applying 10 fold cross validations on dataset.

Linear Regression

In [282]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
In [70]:
airbnb_with_dummies.head()
Out[70]:
host_is_superhost_0 host_is_superhost_1 property_type_Apartment property_type_Condominium property_type_Guesthouse property_type_House room_type_Entire home/apt room_type_Hotel room room_type_Private room room_type_Shared room ... neighbourhood_cleansed_Woodland Hills id accommodates bathrooms bedrooms beds amenities price cleaning_fee minimum_nights
0 1 0 0 1 0 0 1 0 0 0 ... 0 109 6 2.0 2.0 3.0 32 122.0 240.0 7
1 1 0 0 0 0 1 1 0 0 0 ... 0 344 6 1.0 3.0 3.0 41 168.0 100.0 2
2 0 1 1 0 0 0 0 0 1 0 ... 0 2708 1 1.5 1.0 1.0 43 79.0 85.0 6
3 1 0 1 0 0 0 0 0 1 0 ... 0 2732 1 1.0 1.0 1.0 12 140.0 100.0 1
4 1 0 1 0 0 0 1 0 0 0 ... 0 2864 2 1.0 1.0 1.0 20 80.0 75.0 2

5 rows × 281 columns

In [204]:
airbnb_with_dummies.head()
Out[204]:
host_is_superhost_0 host_is_superhost_1 property_type_Apartment property_type_Condominium property_type_Guesthouse property_type_House room_type_Entire home/apt room_type_Hotel room room_type_Private room room_type_Shared room ... neighbourhood_cleansed_Woodland Hills id accommodates bathrooms bedrooms beds amenities price cleaning_fee minimum_nights
0 1 0 0 1 0 0 1 0 0 0 ... 0 109 6 2.0 2.0 3.0 32 122.0 240.0 7
1 1 0 0 0 0 1 1 0 0 0 ... 0 344 6 1.0 3.0 3.0 41 168.0 100.0 2
2 0 1 1 0 0 0 0 0 1 0 ... 0 2708 1 1.5 1.0 1.0 43 79.0 85.0 6
3 1 0 1 0 0 0 0 0 1 0 ... 0 2732 1 1.0 1.0 1.0 12 140.0 100.0 1
4 1 0 1 0 0 0 1 0 0 0 ... 0 2864 2 1.0 1.0 1.0 20 80.0 75.0 2

5 rows × 281 columns

The amenities are retained for model trainig.

In [88]:
airbnb_extra_columns = ['price','id','accommodates']
drop_extra_columns = airbnb_with_dummies.drop(airbnb_extra_columns,axis = 1)
In [ ]:
 
In [ ]:
### check the number of hotal rooms 
In [89]:
drop_extra_columns.head()
Out[89]:
host_is_superhost_0 host_is_superhost_1 property_type_Apartment property_type_Condominium property_type_Guesthouse property_type_House room_type_Entire home/apt room_type_Hotel room room_type_Private room room_type_Shared room ... neighbourhood_cleansed_Wilmington neighbourhood_cleansed_Windsor Square neighbourhood_cleansed_Winnetka neighbourhood_cleansed_Woodland Hills bathrooms bedrooms beds amenities cleaning_fee minimum_nights
0 1 0 0 1 0 0 1 0 0 0 ... 0 0 0 0 2.0 2.0 3.0 32 240.0 7
1 1 0 0 0 0 1 1 0 0 0 ... 0 0 0 0 1.0 3.0 3.0 41 100.0 2
2 0 1 1 0 0 0 0 0 1 0 ... 0 0 0 0 1.5 1.0 1.0 43 85.0 6
3 1 0 1 0 0 0 0 0 1 0 ... 0 0 0 0 1.0 1.0 1.0 12 100.0 1
4 1 0 1 0 0 0 1 0 0 0 ... 0 0 0 0 1.0 1.0 1.0 20 75.0 2

5 rows × 278 columns

In [ ]:
airbnb_no_beds_columns = ['price','id','beds',]

this is x_train without scaling.

In [90]:
noscale_x_train = drop_extra_columns
In [86]:
if 'accommodates'.isin([noscale_x_train.columns]):
    print('yes')
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-86-7ae7b87ff4d6> in <module>
----> 1 if 'accommodates'.isin([noscale_x_train.columns]):
      2     print('yes')

AttributeError: 'str' object has no attribute 'isin'
In [33]:
print(os.getcwd())
/Users/sitebai/Desktop/UCI/fall/bana212da/listing_zips
In [34]:
noscale_x_train.to_csv("/Users/sitebai/Desktop/UCI/fall/bana212da/listing_zips/noscale_x_train_without_price_id_accommodates.csv")
In [74]:
noscale_x_train.head()
Out[74]:
host_is_superhost_0 host_is_superhost_1 property_type_Apartment property_type_Condominium property_type_Guesthouse property_type_House room_type_Entire home/apt room_type_Hotel room room_type_Private room room_type_Shared room ... neighbourhood_cleansed_Windsor Square neighbourhood_cleansed_Winnetka neighbourhood_cleansed_Woodland Hills accommodates bathrooms bedrooms beds amenities cleaning_fee minimum_nights
0 1 0 0 1 0 0 1 0 0 0 ... 0 0 0 6 2.0 2.0 3.0 32 240.0 7
1 1 0 0 0 0 1 1 0 0 0 ... 0 0 0 6 1.0 3.0 3.0 41 100.0 2
2 0 1 1 0 0 0 0 0 1 0 ... 0 0 0 1 1.5 1.0 1.0 43 85.0 6
3 1 0 1 0 0 0 0 0 1 0 ... 0 0 0 1 1.0 1.0 1.0 12 100.0 1
4 1 0 1 0 0 0 1 0 0 0 ... 0 0 0 2 1.0 1.0 1.0 20 75.0 2

5 rows × 279 columns

In [285]:
x_train= drop_extra_columns
In [286]:
x_train.head()
Out[286]:
host_is_superhost_0 host_is_superhost_1 property_type_Apartment property_type_Condominium property_type_Guesthouse property_type_House room_type_Entire home/apt room_type_Hotel room room_type_Private room room_type_Shared room ... neighbourhood_cleansed_Willowbrook neighbourhood_cleansed_Wilmington neighbourhood_cleansed_Windsor Square neighbourhood_cleansed_Winnetka neighbourhood_cleansed_Woodland Hills bathrooms bedrooms beds cleaning_fee minimum_nights
0 1 0 0 1 0 0 1 0 0 0 ... 0 0 0 0 0 0.648862 0.489356 0.581388 1.753823 0.029870
1 1 0 0 0 0 1 1 0 0 0 ... 0 0 0 0 0 -0.514918 1.398624 0.581388 0.151685 -0.210491
2 0 1 1 0 0 0 0 0 1 0 ... 0 0 0 0 0 0.066972 -0.419912 -0.625181 -0.019973 -0.018203
3 1 0 1 0 0 0 0 0 1 0 ... 0 0 0 0 0 -0.514918 -0.419912 -0.625181 0.151685 -0.258563
4 1 0 1 0 0 0 1 0 0 0 ... 0 0 0 0 0 -0.514918 -0.419912 -0.625181 -0.134411 -0.210491

5 rows × 277 columns

In [220]:
y_train= airbnb_with_dummies['price']
In [279]:
import statsmodels.api as stat
In [281]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
In [294]:
noscale_x_train.head()
Out[294]:
host_is_superhost_0 host_is_superhost_1 property_type_Apartment property_type_Condominium property_type_Guesthouse property_type_House room_type_Entire home/apt room_type_Hotel room room_type_Private room room_type_Shared room ... neighbourhood_cleansed_Willowbrook neighbourhood_cleansed_Wilmington neighbourhood_cleansed_Windsor Square neighbourhood_cleansed_Winnetka neighbourhood_cleansed_Woodland Hills bathrooms bedrooms beds cleaning_fee minimum_nights
0 1 0 0 1 0 0 1 0 0 0 ... 0 0 0 0 0 2.0 2.0 3.0 240.0 7
1 1 0 0 0 0 1 1 0 0 0 ... 0 0 0 0 0 1.0 3.0 3.0 100.0 2
2 0 1 1 0 0 0 0 0 1 0 ... 0 0 0 0 0 1.5 1.0 1.0 85.0 6
3 1 0 1 0 0 0 0 0 1 0 ... 0 0 0 0 0 1.0 1.0 1.0 100.0 1
4 1 0 1 0 0 0 1 0 0 0 ... 0 0 0 0 0 1.0 1.0 1.0 75.0 2

5 rows × 277 columns

In [322]:
noscale_x_train.columns
Out[322]:
Index(['host_is_superhost_0', 'host_is_superhost_1', 'property_type_Apartment',
       'property_type_Condominium', 'property_type_Guesthouse',
       'property_type_House', 'room_type_Entire home/apt',
       'room_type_Hotel room', 'room_type_Private room',
       'room_type_Shared room',
       ...
       'neighbourhood_cleansed_Willowbrook',
       'neighbourhood_cleansed_Wilmington',
       'neighbourhood_cleansed_Windsor Square',
       'neighbourhood_cleansed_Winnetka',
       'neighbourhood_cleansed_Woodland Hills', 'bathrooms', 'bedrooms',
       'beds', 'cleaning_fee', 'minimum_nights'],
      dtype='object', length=277)
In [ ]:
X = stat.add_constant(X, prepend=False)
In [367]:
noscale_x_train.head()
Out[367]:
host_is_superhost_0 host_is_superhost_1 property_type_Apartment property_type_Condominium property_type_Guesthouse property_type_House room_type_Entire home/apt room_type_Hotel room room_type_Private room room_type_Shared room ... neighbourhood_cleansed_Wilmington neighbourhood_cleansed_Windsor Square neighbourhood_cleansed_Winnetka neighbourhood_cleansed_Woodland Hills bathrooms bedrooms beds cleaning_fee minimum_nights const
0 1 0 0 1 0 0 1 0 0 0 ... 0 0 0 0 2.0 2.0 3.0 240.0 7 1.0
1 1 0 0 0 0 1 1 0 0 0 ... 0 0 0 0 1.0 3.0 3.0 100.0 2 1.0
2 0 1 1 0 0 0 0 0 1 0 ... 0 0 0 0 1.5 1.0 1.0 85.0 6 1.0
3 1 0 1 0 0 0 0 0 1 0 ... 0 0 0 0 1.0 1.0 1.0 100.0 1 1.0
4 1 0 1 0 0 0 1 0 0 0 ... 0 0 0 0 1.0 1.0 1.0 75.0 2 1.0

5 rows × 278 columns

In [364]:
noscale_x_train = stat.add_constant(noscale_x_train, prepend = False)
In [382]:
noscale_x_train_1 = noscale_x_train.drop(['host_is_superhost_0','property_type_Apartment','room_type_Hotel room'], axis = 1)
In [383]:
ols2 = stat.OLS(y_train, noscale_x_train_1)
ols2_result = ols2.fit()
In [384]:
print(ols2_result.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                  price   R-squared:                       0.425
Model:                            OLS   Adj. R-squared:                  0.425
Method:                 Least Squares   F-statistic:                     1003.
Date:                Fri, 04 Dec 2020   Prob (F-statistic):               0.00
Time:                        14:07:51   Log-Likelihood:            -2.6089e+06
No. Observations:              370236   AIC:                         5.218e+06
Df Residuals:                  369962   BIC:                         5.221e+06
Df Model:                         273                                         
Covariance Type:            nonrobust                                         
================================================================================================================================
                                                                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------------------------------------------------
host_is_superhost_1                                            -12.9393      1.027    -12.596      0.000     -14.953     -10.926
property_type_Condominium                                      -18.0407      1.915     -9.420      0.000     -21.794     -14.287
property_type_Guesthouse                                        36.8097      2.027     18.163      0.000      32.838      40.782
property_type_House                                              3.6489      1.272      2.869      0.004       1.156       6.142
room_type_Entire home/apt                                      520.3981    196.729      2.645      0.008     134.816     905.980
room_type_Private room                                         532.4358    196.727      2.706      0.007     146.856     918.016
room_type_Shared room                                          452.7089    196.734      2.301      0.021      67.116     838.302
neighbourhood_cleansed_Acton                                  -106.5647     80.045     -1.331      0.183    -263.450      50.321
neighbourhood_cleansed_Adams-Normandie                          -5.4848     12.178     -0.450      0.652     -29.353      18.383
neighbourhood_cleansed_Agoura Hills                            -32.9063     15.577     -2.112      0.035     -63.438      -2.375
neighbourhood_cleansed_Agua Dulce                              -46.1263     31.897     -1.446      0.148    -108.643      16.391
neighbourhood_cleansed_Alhambra                                -16.2429      6.190     -2.624      0.009     -28.375      -4.111
neighbourhood_cleansed_Alondra Park                             -8.9109     28.553     -0.312      0.755     -64.875      47.053
neighbourhood_cleansed_Altadena                                 -9.0006      6.845     -1.315      0.189     -22.417       4.415
neighbourhood_cleansed_Angeles Crest                            38.3854    138.565      0.277      0.782    -233.198     309.969
neighbourhood_cleansed_Arcadia                                 -53.8893      7.989     -6.745      0.000     -69.548     -38.231
neighbourhood_cleansed_Arleta                                    4.4573     71.593      0.062      0.950    -135.864     144.778
neighbourhood_cleansed_Arlington Heights                        -6.6900      7.997     -0.837      0.403     -22.364       8.984
neighbourhood_cleansed_Artesia                                 -59.2047     30.715     -1.928      0.054    -119.406       0.996
neighbourhood_cleansed_Athens                                  -38.0814     46.289     -0.823      0.411    -128.806      52.644
neighbourhood_cleansed_Atwater Village                          41.6270      8.084      5.150      0.000      25.783      57.471
neighbourhood_cleansed_Avalon                                  141.1131      7.773     18.154      0.000     125.878     156.348
neighbourhood_cleansed_Avocado Heights                          22.4099     30.006      0.747      0.455     -36.402      81.222
neighbourhood_cleansed_Azusa                                   -12.8239     14.011     -0.915      0.360     -40.284      14.636
neighbourhood_cleansed_Baldwin Hills/Crenshaw                   -9.9572      9.609     -1.036      0.300     -28.791       8.877
neighbourhood_cleansed_Baldwin Park                            -23.5349     16.013     -1.470      0.142     -54.919       7.850
neighbourhood_cleansed_Bel-Air                                 572.4686     11.340     50.480      0.000     550.242     594.695
neighbourhood_cleansed_Bell                                     27.9930     46.260      0.605      0.545     -62.675     118.661
neighbourhood_cleansed_Bell Gardens                             42.8628     37.125      1.155      0.248     -29.902     115.628
neighbourhood_cleansed_Bellflower                              -32.0037     20.545     -1.558      0.119     -72.271       8.263
neighbourhood_cleansed_Beverly Crest                           283.2790      7.928     35.729      0.000     267.739     298.818
neighbourhood_cleansed_Beverly Grove                            13.5980      4.466      3.045      0.002       4.845      22.351
neighbourhood_cleansed_Beverly Hills                            81.2838      4.742     17.143      0.000      71.990      90.577
neighbourhood_cleansed_Beverlywood                               6.3070     13.331      0.473      0.636     -19.822      32.436
neighbourhood_cleansed_Boyle Heights                           -43.9903      8.369     -5.256      0.000     -60.394     -27.587
neighbourhood_cleansed_Bradbury                               -107.2258     74.109     -1.447      0.148    -252.478      38.026
neighbourhood_cleansed_Brentwood                                 6.2716      5.965      1.051      0.293      -5.420      17.963
neighbourhood_cleansed_Broadway-Manchester                      10.6013     26.435      0.401      0.688     -41.211      62.413
neighbourhood_cleansed_Burbank                                   1.9576      5.545      0.353      0.724      -8.911      12.826
neighbourhood_cleansed_Calabasas                               -36.2044     13.929     -2.599      0.009     -63.504      -8.905
neighbourhood_cleansed_Canoga Park                             -31.0914      9.908     -3.138      0.002     -50.511     -11.671
neighbourhood_cleansed_Carson                                  -15.2820     13.749     -1.112      0.266     -42.229      11.665
neighbourhood_cleansed_Carthay                                  16.3683      9.493      1.724      0.085      -2.238      34.975
neighbourhood_cleansed_Castaic                                  -6.3968     25.869     -0.247      0.805     -57.099      44.305
neighbourhood_cleansed_Castaic Canyons                         -58.6207     23.325     -2.513      0.012    -104.337     -12.904
neighbourhood_cleansed_Central-Alameda                          18.3179     19.697      0.930      0.352     -20.288      56.924
neighbourhood_cleansed_Century City                            283.0929     16.547     17.108      0.000     250.660     315.525
neighbourhood_cleansed_Cerritos                                -68.7428     19.633     -3.501      0.000    -107.223     -30.262
neighbourhood_cleansed_Charter Oak                            -110.1890     27.180     -4.054      0.000    -163.460     -56.918
neighbourhood_cleansed_Chatsworth                               50.6359     13.879      3.648      0.000      23.434      77.838
neighbourhood_cleansed_Chatsworth Reservoir                    -63.8616    277.089     -0.230      0.818    -606.949     479.225
neighbourhood_cleansed_Chesterfield Square                      -1.7394     29.012     -0.060      0.952     -58.601      55.122
neighbourhood_cleansed_Cheviot Hills                            43.6503     12.633      3.455      0.001      18.890      68.411
neighbourhood_cleansed_Chinatown                                17.5860      7.940      2.215      0.027       2.023      33.149
neighbourhood_cleansed_Citrus                                   74.7124     45.634      1.637      0.102     -14.729     164.154
neighbourhood_cleansed_Claremont                                23.5656     14.196      1.660      0.097      -4.258      51.389
neighbourhood_cleansed_Commerce                                -49.3005     67.265     -0.733      0.464    -181.139      82.538
neighbourhood_cleansed_Compton                                  16.3769     31.094      0.527      0.598     -44.566      77.320
neighbourhood_cleansed_Covina                                   -6.6394     18.235     -0.364      0.716     -42.379      29.100
neighbourhood_cleansed_Cudahy                                   -1.7906    138.566     -0.013      0.990    -273.377     269.796
neighbourhood_cleansed_Culver City                             -12.1396      5.994     -2.025      0.043     -23.888      -0.391
neighbourhood_cleansed_Cypress Park                            -26.1126     20.550     -1.271      0.204     -66.390      14.165
neighbourhood_cleansed_Del Aire                                -21.9706     15.576     -1.411      0.158     -52.499       8.558
neighbourhood_cleansed_Del Rey                                   0.0691      5.590      0.012      0.990     -10.888      11.026
neighbourhood_cleansed_Diamond Bar                             -39.5238      8.200     -4.820      0.000     -55.596     -23.452
neighbourhood_cleansed_Downey                                  -11.3477     11.175     -1.015      0.310     -33.251      10.556
neighbourhood_cleansed_Downtown                                 52.2278      3.791     13.776      0.000      44.797      59.659
neighbourhood_cleansed_Duarte                                  -45.4232     21.236     -2.139      0.032     -87.046      -3.800
neighbourhood_cleansed_Eagle Rock                              -20.0804      7.172     -2.800      0.005     -34.137      -6.024
neighbourhood_cleansed_East Hollywood                          -28.7197      4.368     -6.574      0.000     -37.282     -20.158
neighbourhood_cleansed_East Los Angeles                          4.6895     10.032      0.467      0.640     -14.974      24.353
neighbourhood_cleansed_East Pasadena                            -3.7166     14.871     -0.250      0.803     -32.863      25.430
neighbourhood_cleansed_East San Gabriel                        -33.4291     11.027     -3.032      0.002     -55.041     -11.817
neighbourhood_cleansed_East Whittier                           -45.5198     62.015     -0.734      0.463    -167.067      76.027
neighbourhood_cleansed_Echo Park                                 8.4500      4.561      1.852      0.064      -0.490      17.390
neighbourhood_cleansed_El Monte                                -24.9658      9.989     -2.499      0.012     -44.543      -5.388
neighbourhood_cleansed_El Segundo                               13.9122      8.667      1.605      0.108      -3.076      30.900
neighbourhood_cleansed_El Sereno                               -15.0423     10.561     -1.424      0.154     -35.741       5.656
neighbourhood_cleansed_Elysian Park                             31.1177     19.731      1.577      0.115      -7.555      69.790
neighbourhood_cleansed_Elysian Valley                          -17.6972     15.139     -1.169      0.242     -47.368      11.974
neighbourhood_cleansed_Encino                                   81.4999      7.102     11.475      0.000      67.580      95.420
neighbourhood_cleansed_Exposition Park                          13.0853      8.135      1.609      0.108      -2.859      29.030
neighbourhood_cleansed_Fairfax                                  27.0178      5.099      5.298      0.000      17.023      37.012
neighbourhood_cleansed_Florence                                  0.3860     27.175      0.014      0.989     -52.876      53.648
neighbourhood_cleansed_Florence-Firestone                       -5.9196     22.946     -0.258      0.796     -50.893      39.054
neighbourhood_cleansed_Gardena                                  -4.1790     10.260     -0.407      0.684     -24.288      15.930
neighbourhood_cleansed_Glassell Park                           -11.0213      9.042     -1.219      0.223     -28.744       6.701
neighbourhood_cleansed_Glendale                                -27.1198      4.800     -5.649      0.000     -36.529     -17.711
neighbourhood_cleansed_Glendora                                 -7.3528     12.927     -0.569      0.569     -32.689      17.983
neighbourhood_cleansed_Gramercy Park                            20.9501     19.535      1.072      0.284     -17.339      59.239
neighbourhood_cleansed_Granada Hills                           -40.1398     11.950     -3.359      0.001     -63.561     -16.718
neighbourhood_cleansed_Green Meadows                            38.5223     27.440      1.404      0.160     -15.260      92.304
neighbourhood_cleansed_Green Valley                           -172.9778     40.517     -4.269      0.000    -252.389     -93.566
neighbourhood_cleansed_Griffith Park                           -31.1528     27.986     -1.113      0.266     -86.005      23.699
neighbourhood_cleansed_Hacienda Heights                        -11.5127      7.177     -1.604      0.109     -25.579       2.554
neighbourhood_cleansed_Hancock Park                             14.6684      7.620      1.925      0.054      -0.267      29.604
neighbourhood_cleansed_Harbor City                             -11.4106     21.933     -0.520      0.603     -54.398      31.577
neighbourhood_cleansed_Harbor Gateway                          -14.5102     12.686     -1.144      0.253     -39.374      10.354
neighbourhood_cleansed_Harvard Heights                         -63.1468      8.414     -7.505      0.000     -79.638     -46.656
neighbourhood_cleansed_Harvard Park                            -33.0321     37.459     -0.882      0.378    -106.450      40.386
neighbourhood_cleansed_Hasley Canyon                            83.4202     76.919      1.085      0.278     -67.340     234.180
neighbourhood_cleansed_Hawaiian Gardens                         31.3407    123.944      0.253      0.800    -211.587     274.268
neighbourhood_cleansed_Hawthorne                               -23.5523      7.951     -2.962      0.003     -39.137      -7.968
neighbourhood_cleansed_Hermosa Beach                            62.6962      7.255      8.642      0.000      48.477      76.915
neighbourhood_cleansed_Highland Park                            -9.6903      6.333     -1.530      0.126     -22.103       2.722
neighbourhood_cleansed_Historic South-Central                  -13.6568     11.776     -1.160      0.246     -36.737       9.424
neighbourhood_cleansed_Hollywood                                14.3949      3.269      4.403      0.000       7.988      20.802
neighbourhood_cleansed_Hollywood Hills                          17.4870      3.959      4.417      0.000       9.727      25.247
neighbourhood_cleansed_Hollywood Hills West                    224.1419      4.506     49.745      0.000     215.311     232.973
neighbourhood_cleansed_Huntington Park                          15.1146     36.799      0.411      0.681     -57.010      87.239
neighbourhood_cleansed_Hyde Park                               -34.1253     10.226     -3.337      0.001     -54.169     -14.082
neighbourhood_cleansed_Industry                                -14.2073     12.562     -1.131      0.258     -38.828      10.413
neighbourhood_cleansed_Inglewood                                -1.0189      5.589     -0.182      0.855     -11.974       9.936
neighbourhood_cleansed_Irwindale                                -8.8693     37.807     -0.235      0.815     -82.970      65.231
neighbourhood_cleansed_Jefferson Park                          -15.0033      8.519     -1.761      0.078     -31.700       1.693
neighbourhood_cleansed_Koreatown                                 4.0427      4.098      0.987      0.324      -3.989      12.074
neighbourhood_cleansed_La Canada Flintridge                     27.3080     16.299      1.675      0.094      -4.637      59.253
neighbourhood_cleansed_La Crescenta-Montrose                    -1.4029     22.490     -0.062      0.950     -45.482      42.676
neighbourhood_cleansed_La Habra Heights                          5.1324     25.755      0.199      0.842     -45.347      55.611
neighbourhood_cleansed_La Mirada                                -3.7729     15.793     -0.239      0.811     -34.726      27.180
neighbourhood_cleansed_La Puente                               -23.0273     32.996     -0.698      0.485     -87.699      41.644
neighbourhood_cleansed_La Verne                                 -2.7141     13.879     -0.196      0.845     -29.917      24.489
neighbourhood_cleansed_Ladera Heights                          -53.8580     12.475     -4.317      0.000     -78.309     -29.407
neighbourhood_cleansed_Lake Balboa                             -15.2769      9.870     -1.548      0.122     -34.622       4.068
neighbourhood_cleansed_Lake Hughes                               5.8015     76.897      0.075      0.940    -144.915     156.518
neighbourhood_cleansed_Lake Los Angeles                          4.5172     45.036      0.100      0.920     -83.753      92.787
neighbourhood_cleansed_Lake View Terrace                       -73.0212     30.543     -2.391      0.017    -132.885     -13.157
neighbourhood_cleansed_Lakewood                                -14.0761     14.194     -0.992      0.321     -41.896      13.744
neighbourhood_cleansed_Lancaster                               -26.6488      9.366     -2.845      0.004     -45.006      -8.292
neighbourhood_cleansed_Larchmont                                13.4209      8.712      1.541      0.123      -3.654      30.496
neighbourhood_cleansed_Lawndale                                  2.7856     11.138      0.250      0.803     -19.045      24.616
neighbourhood_cleansed_Leimert Park                            -13.1696     11.694     -1.126      0.260     -36.090       9.750
neighbourhood_cleansed_Lennox                                  -47.7264     21.869     -2.182      0.029     -90.589      -4.864
neighbourhood_cleansed_Leona Valley                            -26.0775     92.404     -0.282      0.778    -207.186     155.031
neighbourhood_cleansed_Lincoln Heights                         -54.7260     10.561     -5.182      0.000     -75.426     -34.026
neighbourhood_cleansed_Lomita                                   -1.5047     19.042     -0.079      0.937     -38.826      35.816
neighbourhood_cleansed_Long Beach                                8.6512      3.557      2.432      0.015       1.680      15.623
neighbourhood_cleansed_Lopez/Kagel Canyons                      -6.4679     80.035     -0.081      0.936    -163.334     150.398
neighbourhood_cleansed_Los Feliz                                26.6432      4.857      5.486      0.000      17.124      36.162
neighbourhood_cleansed_Lynwood                                  39.6887     40.942      0.969      0.332     -40.556     119.933
neighbourhood_cleansed_Malibu                                  608.7134      6.284     96.863      0.000     596.396     621.030
neighbourhood_cleansed_Manchester Square                         1.8775     21.174      0.089      0.929     -39.622      43.377
neighbourhood_cleansed_Manhattan Beach                          67.3347      6.973      9.657      0.000      53.668      81.001
neighbourhood_cleansed_Mar Vista                                 8.9247      4.867      1.834      0.067      -0.614      18.463
neighbourhood_cleansed_Marina del Rey                           27.8250      6.686      4.162      0.000      14.720      40.929
neighbourhood_cleansed_Mayflower Village                       -62.6409     27.172     -2.305      0.021    -115.897      -9.384
neighbourhood_cleansed_Maywood                                   6.7728     40.520      0.167      0.867     -72.645      86.191
neighbourhood_cleansed_Mid-City                                 -9.1929      4.554     -2.019      0.044     -18.119      -0.267
neighbourhood_cleansed_Mid-Wilshire                              5.5557      4.100      1.355      0.175      -2.481      13.592
neighbourhood_cleansed_Mission Hills                             0.9854     28.260      0.035      0.972     -54.404      56.374
neighbourhood_cleansed_Monrovia                                -38.0462     10.499     -3.624      0.000     -58.624     -17.469
neighbourhood_cleansed_Montebello                              -46.4599     17.432     -2.665      0.008     -80.626     -12.294
neighbourhood_cleansed_Montecito Heights                       -44.1801      9.886     -4.469      0.000     -63.555     -24.805
neighbourhood_cleansed_Monterey Park                           -11.6922      6.335     -1.846      0.065     -24.109       0.725
neighbourhood_cleansed_Mount Washington                        -16.1626      8.134     -1.987      0.047     -32.106      -0.219
neighbourhood_cleansed_North El Monte                            3.1847     30.717      0.104      0.917     -57.019      63.388
neighbourhood_cleansed_North Hills                              -8.0230     12.770     -0.628      0.530     -33.051      17.005
neighbourhood_cleansed_North Hollywood                          -5.6703      4.845     -1.170      0.242     -15.166       3.826
neighbourhood_cleansed_North Whittier                         -239.8748     80.035     -2.997      0.003    -396.740     -83.010
neighbourhood_cleansed_Northeast Antelope Valley              -169.2067     80.043     -2.114      0.035    -326.089     -12.325
neighbourhood_cleansed_Northridge                              -23.1073      9.583     -2.411      0.016     -41.890      -4.325
neighbourhood_cleansed_Northwest Antelope Valley               -91.9144     41.858     -2.196      0.028    -173.955      -9.873
neighbourhood_cleansed_Northwest Palmdale                      -37.2613     38.893     -0.958      0.338    -113.491      38.968
neighbourhood_cleansed_Norwalk                                 -37.7158     15.556     -2.425      0.015     -68.204      -7.227
neighbourhood_cleansed_Pacific Palisades                        99.4777      6.870     14.481      0.000      86.013     112.942
neighbourhood_cleansed_Pacoima                                 -55.5413     29.170     -1.904      0.057    -112.713       1.631
neighbourhood_cleansed_Palmdale                                -34.7230     10.829     -3.206      0.001     -55.948     -13.498
neighbourhood_cleansed_Palms                                   -20.5700      5.664     -3.631      0.000     -31.672      -9.468
neighbourhood_cleansed_Palos Verdes Estates                    -10.2208     25.031     -0.408      0.683     -59.281      38.839
neighbourhood_cleansed_Panorama City                           -39.0722     15.555     -2.512      0.012     -69.560      -8.584
neighbourhood_cleansed_Paramount                                20.4678     30.717      0.666      0.505     -39.737      80.672
neighbourhood_cleansed_Pasadena                                  9.0413      4.916      1.839      0.066      -0.594      18.677
neighbourhood_cleansed_Pico Rivera                             -46.2972     15.445     -2.997      0.003     -76.570     -16.025
neighbourhood_cleansed_Pico-Robertson                           -6.6060      5.956     -1.109      0.267     -18.279       5.067
neighbourhood_cleansed_Pico-Union                              -48.3159      6.379     -7.574      0.000     -60.818     -35.813
neighbourhood_cleansed_Playa Vista                              -5.3927      8.924     -0.604      0.546     -22.883      12.098
neighbourhood_cleansed_Playa del Rey                             2.2795      7.844      0.291      0.771     -13.094      17.653
neighbourhood_cleansed_Pomona                                  -37.2359      8.947     -4.162      0.000     -54.773     -19.699
neighbourhood_cleansed_Porter Ranch                            -98.1244     18.667     -5.257      0.000    -134.711     -61.538
neighbourhood_cleansed_Quartz Hill                             -49.0747     80.031     -0.613      0.540    -205.933     107.783
neighbourhood_cleansed_Rancho Dominguez                        -26.2258     55.483     -0.473      0.636    -134.972      82.520
neighbourhood_cleansed_Rancho Palos Verdes                      11.6454     13.198      0.882      0.378     -14.222      37.513
neighbourhood_cleansed_Rancho Park                              35.0039     10.104      3.464      0.001      15.201      54.807
neighbourhood_cleansed_Redondo Beach                             1.9740      6.058      0.326      0.745      -9.900      13.848
neighbourhood_cleansed_Reseda                                  -30.7377      8.104     -3.793      0.000     -46.620     -14.855
neighbourhood_cleansed_Ridge Route                              -3.5503     56.629     -0.063      0.950    -114.542     107.441
neighbourhood_cleansed_Rolling Hills                            41.6820     63.647      0.655      0.513     -83.064     166.428
neighbourhood_cleansed_Rolling Hills Estates                  -154.6773     35.886     -4.310      0.000    -225.014     -84.341
neighbourhood_cleansed_Rosemead                                -37.5052     10.802     -3.472      0.001     -58.677     -16.333
neighbourhood_cleansed_Rowland Heights                         -38.1943      5.130     -7.446      0.000     -48.248     -28.140
neighbourhood_cleansed_San Dimas                               -39.6256     19.539     -2.028      0.043     -77.922      -1.330
neighbourhood_cleansed_San Fernando                            -23.5066     23.326     -1.008      0.314     -69.225      22.211
neighbourhood_cleansed_San Gabriel                             -19.8086      7.837     -2.528      0.011     -35.169      -4.448
neighbourhood_cleansed_San Marino                               21.6682     26.928      0.805      0.421     -31.109      74.445
neighbourhood_cleansed_San Pasqual                             -28.0859     28.706     -0.978      0.328     -84.349      28.177
neighbourhood_cleansed_San Pedro                                -2.7351      8.826     -0.310      0.757     -20.035      14.564
neighbourhood_cleansed_Santa Clarita                             0.2346      8.884      0.026      0.979     -17.178      17.647
neighbourhood_cleansed_Santa Fe Springs                        -12.8044     76.899     -0.167      0.868    -163.525     137.916
neighbourhood_cleansed_Santa Monica                             20.5165      3.891      5.273      0.000      12.890      28.143
neighbourhood_cleansed_Sawtelle                                 16.1383      4.330      3.727      0.000       7.651      24.625
neighbourhood_cleansed_Sepulveda Basin                        -256.4905     39.692     -6.462      0.000    -334.286    -178.695
neighbourhood_cleansed_Shadow Hills                            -14.9991     19.000     -0.789      0.430     -52.238      22.240
neighbourhood_cleansed_Sherman Oaks                             34.1860      5.049      6.770      0.000      24.289      44.083
neighbourhood_cleansed_Sierra Madre                             -9.2664     26.207     -0.354      0.724     -60.631      42.098
neighbourhood_cleansed_Signal Hill                             -55.7482     17.153     -3.250      0.001     -89.367     -22.130
neighbourhood_cleansed_Silver Lake                              14.8443      4.259      3.485      0.000       6.496      23.193
neighbourhood_cleansed_South Diamond Bar                       -31.6513     87.662     -0.361      0.718    -203.466     140.163
neighbourhood_cleansed_South El Monte                         -104.4920     36.174     -2.889      0.004    -175.391     -33.593
neighbourhood_cleansed_South Gate                               -7.7632     27.047     -0.287      0.774     -60.775      45.249
neighbourhood_cleansed_South Park                              -45.2272     21.873     -2.068      0.039     -88.097      -2.357
neighbourhood_cleansed_South Pasadena                          -10.9885     11.276     -0.975      0.330     -33.088      11.111
neighbourhood_cleansed_South San Gabriel                        -9.1599     17.593     -0.521      0.603     -43.641      25.321
neighbourhood_cleansed_South San Jose Hills                     27.7520     69.324      0.400      0.689    -108.122     163.626
neighbourhood_cleansed_South Whittier                           -6.6803     21.610     -0.309      0.757     -49.036      35.675
neighbourhood_cleansed_Southeast Antelope Valley                31.2356     42.341      0.738      0.461     -51.752     114.223
neighbourhood_cleansed_Stevenson Ranch                         -89.6734     32.549     -2.755      0.006    -153.468     -25.878
neighbourhood_cleansed_Studio City                              -1.5234      5.000     -0.305      0.761     -11.322       8.275
neighbourhood_cleansed_Sun Valley                               -8.8142      9.642     -0.914      0.361     -27.713      10.084
neighbourhood_cleansed_Sun Village                              70.3470     76.907      0.915      0.360     -80.389     221.083
neighbourhood_cleansed_Sunland                                  17.0241     30.716      0.554      0.579     -43.179      77.227
neighbourhood_cleansed_Sylmar                                  -11.7748     14.747     -0.798      0.425     -40.679      17.129
neighbourhood_cleansed_Tarzana                                 -19.8365      7.828     -2.534      0.011     -35.180      -4.493
neighbourhood_cleansed_Temple City                             -49.2619      8.002     -6.156      0.000     -64.945     -33.578
neighbourhood_cleansed_Toluca Lake                             -31.5744      9.087     -3.475      0.001     -49.384     -13.765
neighbourhood_cleansed_Topanga                                  54.1308      6.762      8.005      0.000      40.877      67.384
neighbourhood_cleansed_Torrance                                 -1.3001      6.493     -0.200      0.841     -14.026      11.426
neighbourhood_cleansed_Tujunga                                 -48.8508     21.418     -2.281      0.023     -90.830      -6.871
neighbourhood_cleansed_Tujunga Canyons                          62.8182    277.090      0.227      0.821    -480.269     605.906
neighbourhood_cleansed_Unincorporated Catalina Island          405.0977     46.286      8.752      0.000     314.378     495.818
neighbourhood_cleansed_Unincorporated Santa Monica Mountains   243.5817     10.354     23.525      0.000     223.288     263.875
neighbourhood_cleansed_Unincorporated Santa Susana Mountains    87.6145     28.702      3.053      0.002      31.360     143.869
neighbourhood_cleansed_Universal City                          -59.9851     43.358     -1.383      0.167    -144.966      24.995
neighbourhood_cleansed_University Park                         -68.4764     11.152     -6.140      0.000     -90.335     -46.618
neighbourhood_cleansed_Val Verde                                66.5868     76.913      0.866      0.387     -84.161     217.334
neighbourhood_cleansed_Valinda                                 -26.2801     26.555     -0.990      0.322     -78.327      25.767
neighbourhood_cleansed_Valley Glen                              -2.4783      6.717     -0.369      0.712     -15.644      10.687
neighbourhood_cleansed_Valley Village                           -1.9271      7.883     -0.244      0.807     -17.378      13.523
neighbourhood_cleansed_Van Nuys                                -25.4801      5.929     -4.297      0.000     -37.102     -13.859
neighbourhood_cleansed_Venice                                   39.6395      3.269     12.126      0.000      33.233      46.046
neighbourhood_cleansed_Vermont Knolls                           14.2095     28.558      0.498      0.619     -41.764      70.183
neighbourhood_cleansed_Vermont Square                          -22.6281      9.660     -2.342      0.019     -41.562      -3.694
neighbourhood_cleansed_Vermont Vista                            15.3615     40.086      0.383      0.702     -63.206      93.929
neighbourhood_cleansed_Vermont-Slauson                         -46.2375     24.550     -1.883      0.060     -94.355       1.880
neighbourhood_cleansed_Vernon                                   77.6409    138.569      0.560      0.575    -193.950     349.231
neighbourhood_cleansed_Veterans Administration                 -30.0477     24.278     -1.238      0.216     -77.631      17.536
neighbourhood_cleansed_View Park-Windsor Hills                  36.0807     10.573      3.413      0.001      15.358      56.804
neighbourhood_cleansed_Vincent                                -244.1906     74.110     -3.295      0.001    -389.443     -98.938
neighbourhood_cleansed_Walnut                                  -62.8057     12.754     -4.924      0.000     -87.803     -37.809
neighbourhood_cleansed_Watts                                    15.4787     22.707      0.682      0.495     -29.026      59.984
neighbourhood_cleansed_West Adams                               -0.4608      9.472     -0.049      0.961     -19.025      18.104
neighbourhood_cleansed_West Carson                              -7.4557     18.279     -0.408      0.683     -43.282      28.371
neighbourhood_cleansed_West Compton                              9.8612    104.760      0.094      0.925    -195.466     215.188
neighbourhood_cleansed_West Covina                             -58.3698     11.210     -5.207      0.000     -80.341     -36.398
neighbourhood_cleansed_West Hills                              -52.6579     10.869     -4.845      0.000     -73.960     -31.356
neighbourhood_cleansed_West Hollywood                           19.5654      4.185      4.675      0.000      11.363      27.768
neighbourhood_cleansed_West Los Angeles                        -21.7137      7.482     -2.902      0.004     -36.379      -7.049
neighbourhood_cleansed_West Puente Valley                     -184.2500     32.350     -5.695      0.000    -247.656    -120.845
neighbourhood_cleansed_West Whittier-Los Nietos                -11.3609     33.227     -0.342      0.732     -76.485      53.763
neighbourhood_cleansed_Westchester                               3.2875      5.593      0.588      0.557      -7.675      14.250
neighbourhood_cleansed_Westlake                                  7.2955      3.995      1.826      0.068      -0.535      15.126
neighbourhood_cleansed_Westlake Village                        120.4332     40.508      2.973      0.003      41.039     199.827
neighbourhood_cleansed_Westmont                                 32.8261     21.804      1.505      0.132      -9.910      75.562
neighbourhood_cleansed_Westwood                                  6.6821      4.476      1.493      0.135      -2.090      15.455
neighbourhood_cleansed_Whittier                                -40.3511     11.366     -3.550      0.000     -62.628     -18.074
neighbourhood_cleansed_Willowbrook                             -83.7369     29.512     -2.837      0.005    -141.579     -25.895
neighbourhood_cleansed_Wilmington                               12.1511     26.674      0.456      0.649     -40.129      64.431
neighbourhood_cleansed_Windsor Square                            1.6723     13.262      0.126      0.900     -24.321      27.666
neighbourhood_cleansed_Winnetka                                -23.4974     10.817     -2.172      0.030     -44.698      -2.297
neighbourhood_cleansed_Woodland Hills                          -11.0454      5.569     -1.984      0.047     -21.960      -0.131
bathrooms                                                      140.1415      0.883    158.629      0.000     138.410     141.873
bedrooms                                                        21.1746      0.806     26.283      0.000      19.596      22.754
beds                                                            -9.9673      0.429    -23.261      0.000     -10.807      -9.127
cleaning_fee                                                     1.2008      0.008    153.832      0.000       1.185       1.216
minimum_nights                                                  -0.5107      0.022    -22.760      0.000      -0.555      -0.467
const                                                         -664.7340    196.008     -3.391      0.001   -1048.904    -280.564
==============================================================================
Omnibus:                   781545.739   Durbin-Watson:                   1.881
Prob(Omnibus):                  0.000   Jarque-Bera (JB):       5319716587.820
Skew:                          18.311   Prob(JB):                         0.00
Kurtosis:                     589.090   Cond. No.                     2.25e+15
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The smallest eigenvalue is 1.11e-21. This might indicate that there are
strong multicollinearity problems or that the design matrix is singular.
In [ ]:
 
In [92]:
ols =stat.OLS(y_train, noscale_x_train)
ols_result =ols.fit()

Superhost has negative effect

In [ ]:
### Email Jinan to ask about property type 
In [ ]:
### Drop the hotel room types as it only has two listings. 
In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
In [93]:
print(ols_result.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                  price   R-squared:                       0.426
Model:                            OLS   Adj. R-squared:                  0.425
Method:                 Least Squares   F-statistic:                     1001.
Date:                Sat, 05 Dec 2020   Prob (F-statistic):               0.00
Time:                        17:19:59   Log-Likelihood:            -2.6088e+06
No. Observations:              370236   AIC:                         5.218e+06
Df Residuals:                  369961   BIC:                         5.221e+06
Df Model:                         274                                         
Covariance Type:            nonrobust                                         
================================================================================================================================
                                                                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------------------------------------------------
host_is_superhost_0                                           -133.5204     24.542     -5.441      0.000    -181.622     -85.419
host_is_superhost_1                                           -141.9262     24.552     -5.781      0.000    -190.047     -93.805
property_type_Apartment                                        -75.6541     12.299     -6.151      0.000     -99.760     -51.549
property_type_Condominium                                      -91.7552     12.366     -7.420      0.000    -115.991     -67.519
property_type_Guesthouse                                       -36.8777     12.368     -2.982      0.003     -61.118     -12.637
property_type_House                                            -71.1596     12.301     -5.785      0.000     -95.269     -47.050
room_type_Entire home/apt                                       77.4344     36.941      2.096      0.036       5.031     149.838
room_type_Hotel room                                          -446.8670    159.747     -2.797      0.005    -759.966    -133.768
room_type_Private room                                          88.3273     36.938      2.391      0.017      15.929     160.725
room_type_Shared room                                            5.6587     36.988      0.153      0.878     -66.837      78.154
neighbourhood_cleansed_Acton                                  -101.8420     80.018     -1.273      0.203    -258.674      54.990
neighbourhood_cleansed_Adams-Normandie                          -8.1555     12.156     -0.671      0.502     -31.980      15.669
neighbourhood_cleansed_Agoura Hills                            -32.4348     15.556     -2.085      0.037     -62.924      -1.946
neighbourhood_cleansed_Agua Dulce                              -44.8240     31.879     -1.406      0.160    -107.306      17.658
neighbourhood_cleansed_Alhambra                                -15.1886      6.145     -2.472      0.013     -27.234      -3.144
neighbourhood_cleansed_Alondra Park                             -8.1338     28.536     -0.285      0.776     -64.063      47.795
neighbourhood_cleansed_Altadena                                 -6.0923      6.805     -0.895      0.371     -19.430       7.245
neighbourhood_cleansed_Angeles Crest                            38.9003    138.521      0.281      0.779    -232.597     310.397
neighbourhood_cleansed_Arcadia                                 -53.5650      7.954     -6.734      0.000     -69.155     -37.975
neighbourhood_cleansed_Arleta                                    6.3449     71.568      0.089      0.929    -133.926     146.616
neighbourhood_cleansed_Arlington Heights                        -5.0970      7.961     -0.640      0.522     -20.701      10.507
neighbourhood_cleansed_Artesia                                 -60.1427     30.698     -1.959      0.050    -120.309       0.024
neighbourhood_cleansed_Athens                                  -38.7407     46.269     -0.837      0.402    -129.427      51.946
neighbourhood_cleansed_Atwater Village                          42.3904      8.049      5.267      0.000      26.615      58.166
neighbourhood_cleansed_Avalon                                  139.7511      7.739     18.058      0.000     124.583     154.919
neighbourhood_cleansed_Avocado Heights                          27.8911     29.990      0.930      0.352     -30.888      86.670
neighbourhood_cleansed_Azusa                                   -11.7572     13.988     -0.841      0.401     -39.172      15.658
neighbourhood_cleansed_Baldwin Hills/Crenshaw                   -9.0171      9.579     -0.941      0.347     -27.792       9.758
neighbourhood_cleansed_Baldwin Park                            -23.5781     15.992     -1.474      0.140     -54.921       7.765
neighbourhood_cleansed_Bel-Air                                 573.3431     11.313     50.678      0.000     551.169     595.517
neighbourhood_cleansed_Bell                                     28.0622     46.240      0.607      0.544     -62.567     118.691
neighbourhood_cleansed_Bell Gardens                             38.5994     37.109      1.040      0.298     -34.133     111.332
neighbourhood_cleansed_Bellflower                              -32.3598     20.526     -1.577      0.115     -72.590       7.870
neighbourhood_cleansed_Beverly Crest                           285.5859      7.893     36.184      0.000     270.116     301.055
neighbourhood_cleansed_Beverly Grove                            15.1846      4.406      3.446      0.001       6.549      23.820
neighbourhood_cleansed_Beverly Hills                            82.3291      4.685     17.573      0.000      73.147      91.512
neighbourhood_cleansed_Beverlywood                               4.7024     13.309      0.353      0.724     -21.383      30.788
neighbourhood_cleansed_Boyle Heights                           -42.5180      8.335     -5.101      0.000     -58.855     -26.181
neighbourhood_cleansed_Bradbury                               -104.9160     74.083     -1.416      0.157    -250.117      40.285
neighbourhood_cleansed_Brentwood                                 7.4304      5.919      1.255      0.209      -4.171      19.032
neighbourhood_cleansed_Broadway-Manchester                      10.9632     26.417      0.415      0.678     -40.814      62.740
neighbourhood_cleansed_Burbank                                   2.9026      5.496      0.528      0.597      -7.869      13.674
neighbourhood_cleansed_Calabasas                               -33.6385     13.906     -2.419      0.016     -60.893      -6.384
neighbourhood_cleansed_Canoga Park                             -26.4553      9.881     -2.677      0.007     -45.822      -7.089
neighbourhood_cleansed_Carson                                  -13.7684     13.726     -1.003      0.316     -40.670      13.133
neighbourhood_cleansed_Carthay                                  18.6935      9.463      1.975      0.048       0.147      37.240
neighbourhood_cleansed_Castaic                                 -10.4142     25.853     -0.403      0.687     -61.085      40.257
neighbourhood_cleansed_Castaic Canyons                         -58.3610     23.307     -2.504      0.012    -104.042     -12.680
neighbourhood_cleansed_Central-Alameda                          18.1808     19.678      0.924      0.356     -20.387      56.749
neighbourhood_cleansed_Century City                            286.9593     16.527     17.363      0.000     254.566     319.352
neighbourhood_cleansed_Cerritos                                -68.4930     19.614     -3.492      0.000    -106.936     -30.050
neighbourhood_cleansed_Charter Oak                            -101.6263     27.166     -3.741      0.000    -154.870     -48.382
neighbourhood_cleansed_Chatsworth                               50.7810     13.856      3.665      0.000      23.624      77.938
neighbourhood_cleansed_Chatsworth Reservoir                    -67.6217    277.004     -0.244      0.807    -610.542     475.298
neighbourhood_cleansed_Chesterfield Square                      -0.8601     28.994     -0.030      0.976     -57.687      55.967
neighbourhood_cleansed_Cheviot Hills                            42.9463     12.609      3.406      0.001      18.232      67.660
neighbourhood_cleansed_Chinatown                                21.2078      7.906      2.682      0.007       5.712      36.704
neighbourhood_cleansed_Citrus                                   71.9986     45.615      1.578      0.114     -17.406     161.403
neighbourhood_cleansed_Claremont                                22.8476     14.174      1.612      0.107      -4.933      50.628
neighbourhood_cleansed_Commerce                                -46.4324     67.241     -0.691      0.490    -178.223      85.358
neighbourhood_cleansed_Compton                                  18.7818     31.076      0.604      0.546     -42.127      79.690
neighbourhood_cleansed_Covina                                   -5.7975     18.215     -0.318      0.750     -41.498      29.903
neighbourhood_cleansed_Cudahy                                   -1.0196    138.522     -0.007      0.994    -272.519     270.480
neighbourhood_cleansed_Culver City                             -10.5873      5.948     -1.780      0.075     -22.246       1.071
neighbourhood_cleansed_Cypress Park                            -24.5542     20.531     -1.196      0.232     -64.795      15.686
neighbourhood_cleansed_Del Aire                                -20.6143     15.554     -1.325      0.185     -51.100       9.872
neighbourhood_cleansed_Del Rey                                   1.0423      5.542      0.188      0.851      -9.820      11.905
neighbourhood_cleansed_Diamond Bar                             -38.9633      8.166     -4.772      0.000     -54.968     -22.959
neighbourhood_cleansed_Downey                                   -9.1355     11.148     -0.819      0.413     -30.986      12.715
neighbourhood_cleansed_Downtown                                 56.2612      3.725     15.104      0.000      48.960      63.562
neighbourhood_cleansed_Duarte                                  -44.3396     21.218     -2.090      0.037     -85.926      -2.754
neighbourhood_cleansed_Eagle Rock                              -19.3608      7.133     -2.714      0.007     -33.342      -5.380
neighbourhood_cleansed_East Hollywood                          -27.3903      4.307     -6.360      0.000     -35.832     -18.949
neighbourhood_cleansed_East Los Angeles                          5.4198     10.003      0.542      0.588     -14.187      25.026
neighbourhood_cleansed_East Pasadena                            -3.2592     14.849     -0.219      0.826     -32.362      25.844
neighbourhood_cleansed_East San Gabriel                        -33.9159     11.000     -3.083      0.002     -55.476     -12.356
neighbourhood_cleansed_East Whittier                           -46.8976     61.992     -0.757      0.449    -168.400      74.605
neighbourhood_cleansed_Echo Park                                 8.6874      4.503      1.929      0.054      -0.139      17.513
neighbourhood_cleansed_El Monte                                -22.2081      9.959     -2.230      0.026     -41.728      -2.688
neighbourhood_cleansed_El Segundo                               17.0050      8.635      1.969      0.049       0.080      33.930
neighbourhood_cleansed_El Sereno                               -13.0282     10.533     -1.237      0.216     -33.672       7.615
neighbourhood_cleansed_Elysian Park                             34.2444     19.712      1.737      0.082      -4.391      72.880
neighbourhood_cleansed_Elysian Valley                          -17.3044     15.117     -1.145      0.252     -46.933      12.324
neighbourhood_cleansed_Encino                                   84.4890      7.064     11.961      0.000      70.645      98.333
neighbourhood_cleansed_Exposition Park                          12.2593      8.102      1.513      0.130      -3.619      28.138
neighbourhood_cleansed_Fairfax                                  27.8807      5.047      5.525      0.000      17.990      37.772
neighbourhood_cleansed_Florence                                 -2.1636     27.158     -0.080      0.937     -55.393      51.066
neighbourhood_cleansed_Florence-Firestone                       -3.8000     22.928     -0.166      0.868     -48.738      41.138
neighbourhood_cleansed_Gardena                                  -4.1926     10.232     -0.410      0.682     -24.246      15.861
neighbourhood_cleansed_Glassell Park                           -10.6937      9.011     -1.187      0.235     -28.354       6.967
neighbourhood_cleansed_Glendale                                -25.3624      4.744     -5.346      0.000     -34.661     -16.064
neighbourhood_cleansed_Glendora                                 -7.9210     12.903     -0.614      0.539     -33.210      17.368
neighbourhood_cleansed_Gramercy Park                            25.7932     19.517      1.322      0.186     -12.460      64.047
neighbourhood_cleansed_Granada Hills                           -37.4670     11.925     -3.142      0.002     -60.839     -14.095
neighbourhood_cleansed_Green Meadows                            40.3211     27.422      1.470      0.141     -13.426      94.068
neighbourhood_cleansed_Green Valley                           -170.8632     40.498     -4.219      0.000    -250.238     -91.489
neighbourhood_cleansed_Griffith Park                           -37.7960     27.973     -1.351      0.177     -92.623      17.031
neighbourhood_cleansed_Hacienda Heights                        -10.6773      7.138     -1.496      0.135     -24.668       3.313
neighbourhood_cleansed_Hancock Park                             16.6796      7.584      2.199      0.028       1.816      31.543
neighbourhood_cleansed_Harbor City                             -10.7534     21.914     -0.491      0.624     -53.704      32.198
neighbourhood_cleansed_Harbor Gateway                          -13.3666     12.661     -1.056      0.291     -38.182      11.449
neighbourhood_cleansed_Harvard Heights                         -62.6397      8.380     -7.475      0.000     -79.064     -46.216
neighbourhood_cleansed_Harvard Park                            -32.7366     37.440     -0.874      0.382    -106.118      40.645
neighbourhood_cleansed_Hasley Canyon                            84.4743     76.893      1.099      0.272     -66.233     235.181
neighbourhood_cleansed_Hawaiian Gardens                         34.8458    123.904      0.281      0.779    -208.003     277.695
neighbourhood_cleansed_Hawthorne                               -22.9825      7.916     -2.903      0.004     -38.498      -7.467
neighbourhood_cleansed_Hermosa Beach                            62.2626      7.217      8.627      0.000      48.117      76.409
neighbourhood_cleansed_Highland Park                            -9.2765      6.290     -1.475      0.140     -21.605       3.052
neighbourhood_cleansed_Historic South-Central                  -10.8767     11.750     -0.926      0.355     -33.906      12.153
neighbourhood_cleansed_Hollywood                                16.2767      3.187      5.106      0.000      10.029      22.524
neighbourhood_cleansed_Hollywood Hills                          19.3336      3.891      4.968      0.000      11.706      26.961
neighbourhood_cleansed_Hollywood Hills West                    226.2936      4.446     50.898      0.000     217.580     235.008
neighbourhood_cleansed_Huntington Park                          19.4573     36.781      0.529      0.597     -52.632      91.546
neighbourhood_cleansed_Hyde Park                               -33.1428     10.198     -3.250      0.001     -53.130     -13.156
neighbourhood_cleansed_Industry                                -11.3630     12.538     -0.906      0.365     -35.936      13.210
neighbourhood_cleansed_Inglewood                                 0.9563      5.541      0.173      0.863      -9.903      11.816
neighbourhood_cleansed_Irwindale                                -1.3578     37.791     -0.036      0.971     -75.426      72.711
neighbourhood_cleansed_Jefferson Park                          -13.1270      8.485     -1.547      0.122     -29.758       3.504
neighbourhood_cleansed_Koreatown                                 5.3325      4.039      1.320      0.187      -2.584      13.249
neighbourhood_cleansed_La Canada Flintridge                     30.2368     16.278      1.858      0.063      -1.667      62.141
neighbourhood_cleansed_La Crescenta-Montrose                    -0.2255     22.471     -0.010      0.992     -44.268      43.817
neighbourhood_cleansed_La Habra Heights                          6.3923     25.737      0.248      0.804     -44.051      56.836
neighbourhood_cleansed_La Mirada                                -3.4825     15.771     -0.221      0.825     -34.394      27.429
neighbourhood_cleansed_La Puente                               -25.4848     32.979     -0.773      0.440     -90.123      39.153
neighbourhood_cleansed_La Verne                                  0.6515     13.857      0.047      0.962     -26.507      27.810
neighbourhood_cleansed_Ladera Heights                          -51.3377     12.450     -4.123      0.000     -75.740     -26.935
neighbourhood_cleansed_Lake Balboa                             -11.7271      9.841     -1.192      0.233     -31.016       7.562
neighbourhood_cleansed_Lake Hughes                              14.0763     76.872      0.183      0.855    -136.590     164.743
neighbourhood_cleansed_Lake Los Angeles                          0.7871     45.018      0.017      0.986     -87.447      89.021
neighbourhood_cleansed_Lake View Terrace                       -71.3254     30.525     -2.337      0.019    -131.154     -11.496
neighbourhood_cleansed_Lakewood                                -12.0673     14.171     -0.852      0.394     -39.843      15.708
neighbourhood_cleansed_Lancaster                               -24.2918      9.335     -2.602      0.009     -42.588      -5.995
neighbourhood_cleansed_Larchmont                                13.7581      8.680      1.585      0.113      -3.254      30.770
neighbourhood_cleansed_Lawndale                                  4.4931     11.111      0.404      0.686     -17.285      26.271
neighbourhood_cleansed_Leimert Park                            -12.1266     11.668     -1.039      0.299     -34.996      10.742
neighbourhood_cleansed_Lennox                                  -44.3893     21.851     -2.031      0.042     -87.216      -1.562
neighbourhood_cleansed_Leona Valley                            -26.4014     92.373     -0.286      0.775    -207.449     154.646
neighbourhood_cleansed_Lincoln Heights                         -52.3364     10.534     -4.968      0.000     -72.982     -31.691
neighbourhood_cleansed_Lomita                                    0.3551     19.022      0.019      0.985     -36.927      37.638
neighbourhood_cleansed_Long Beach                               11.1348      3.482      3.197      0.001       4.309      17.960
neighbourhood_cleansed_Lopez/Kagel Canyons                       0.9400     80.008      0.012      0.991    -155.874     157.754
neighbourhood_cleansed_Los Feliz                                26.6605      4.802      5.552      0.000      17.248      36.073
neighbourhood_cleansed_Lynwood                                  37.9153     40.923      0.926      0.354     -42.293     118.124
neighbourhood_cleansed_Malibu                                  609.6742      6.241     97.695      0.000     597.443     621.906
neighbourhood_cleansed_Manchester Square                         1.2028     21.155      0.057      0.955     -40.261      42.666
neighbourhood_cleansed_Manhattan Beach                          66.7999      6.934      9.633      0.000      53.209      80.391
neighbourhood_cleansed_Mar Vista                                 9.1190      4.812      1.895      0.058      -0.312      18.550
neighbourhood_cleansed_Marina del Rey                           31.3902      6.647      4.723      0.000      18.363      44.417
neighbourhood_cleansed_Mayflower Village                       -62.3335     27.154     -2.296      0.022    -115.555      -9.112
neighbourhood_cleansed_Maywood                                   2.2779     40.503      0.056      0.955     -77.107      81.663
neighbourhood_cleansed_Mid-City                                 -7.7991      4.495     -1.735      0.083     -16.609       1.011
neighbourhood_cleansed_Mid-Wilshire                              7.2723      4.035      1.802      0.072      -0.637      15.181
neighbourhood_cleansed_Mission Hills                             4.1506     28.242      0.147      0.883     -51.204      59.505
neighbourhood_cleansed_Monrovia                                -36.5844     10.471     -3.494      0.000     -57.107     -16.062
neighbourhood_cleansed_Montebello                              -46.9611     17.412     -2.697      0.007     -81.089     -12.834
neighbourhood_cleansed_Montecito Heights                       -44.5970      9.857     -4.525      0.000     -63.916     -25.278
neighbourhood_cleansed_Monterey Park                           -10.3246      6.292     -1.641      0.101     -22.656       2.007
neighbourhood_cleansed_Mount Washington                        -15.1585      8.100     -1.872      0.061     -31.033       0.716
neighbourhood_cleansed_North El Monte                           -3.2011     30.703     -0.104      0.917     -63.378      56.976
neighbourhood_cleansed_North Hills                              -0.3106     12.752     -0.024      0.981     -25.304      24.683
neighbourhood_cleansed_North Hollywood                          -3.5750      4.789     -0.746      0.455     -12.962       5.812
neighbourhood_cleansed_North Whittier                         -243.6398     80.008     -3.045      0.002    -400.452     -86.827
neighbourhood_cleansed_Northeast Antelope Valley              -163.1591     80.016     -2.039      0.041    -319.988      -6.330
neighbourhood_cleansed_Northridge                              -21.8250      9.553     -2.285      0.022     -40.548      -3.102
neighbourhood_cleansed_Northwest Antelope Valley               -92.3254     41.839     -2.207      0.027    -174.329     -10.321
neighbourhood_cleansed_Northwest Palmdale                      -38.7772     38.875     -0.997      0.319    -114.972      37.417
neighbourhood_cleansed_Norwalk                                 -37.0612     15.534     -2.386      0.017     -67.508      -6.615
neighbourhood_cleansed_Pacific Palisades                       100.0329      6.830     14.647      0.000      86.647     113.419
neighbourhood_cleansed_Pacoima                                 -57.3189     29.153     -1.966      0.049    -114.457      -0.181
neighbourhood_cleansed_Palmdale                                -31.2670     10.803     -2.894      0.004     -52.440     -10.094
neighbourhood_cleansed_Palms                                   -20.1068      5.617     -3.580      0.000     -31.116      -9.097
neighbourhood_cleansed_Palos Verdes Estates                    -13.4194     25.015     -0.536      0.592     -62.448      35.609
neighbourhood_cleansed_Panorama City                           -33.7895     15.536     -2.175      0.030     -64.239      -3.340
neighbourhood_cleansed_Paramount                                19.5082     30.700      0.635      0.525     -40.662      79.678
neighbourhood_cleansed_Pasadena                                 11.5291      4.862      2.371      0.018       2.000      21.058
neighbourhood_cleansed_Pico Rivera                             -43.7803     15.424     -2.839      0.005     -74.010     -13.550
neighbourhood_cleansed_Pico-Robertson                           -6.6710      5.911     -1.129      0.259     -18.256       4.914
neighbourhood_cleansed_Pico-Union                              -49.1053      6.336     -7.750      0.000     -61.524     -36.686
neighbourhood_cleansed_Playa Vista                              -3.1216      8.892     -0.351      0.726     -20.550      14.307
neighbourhood_cleansed_Playa del Rey                             3.4571      7.808      0.443      0.658     -11.846      18.760
neighbourhood_cleansed_Pomona                                  -36.1663      8.915     -4.057      0.000     -53.640     -18.692
neighbourhood_cleansed_Porter Ranch                            -95.6397     18.647     -5.129      0.000    -132.188     -59.091
neighbourhood_cleansed_Quartz Hill                             -45.0743     80.003     -0.563      0.573    -201.879     111.730
neighbourhood_cleansed_Rancho Dominguez                        -19.8115     55.463     -0.357      0.721    -128.517      88.894
neighbourhood_cleansed_Rancho Palos Verdes                      12.6899     13.174      0.963      0.335     -13.131      38.511
neighbourhood_cleansed_Rancho Park                              34.9384     10.075      3.468      0.001      15.191      54.686
neighbourhood_cleansed_Redondo Beach                             2.6911      6.013      0.448      0.654      -9.095      14.477
neighbourhood_cleansed_Reseda                                  -27.6947      8.069     -3.432      0.001     -43.510     -11.879
neighbourhood_cleansed_Ridge Route                              -0.0088     56.607     -0.000      1.000    -110.958     110.940
neighbourhood_cleansed_Rolling Hills                            50.0252     63.625      0.786      0.432     -74.678     174.729
neighbourhood_cleansed_Rolling Hills Estates                  -153.7309     35.868     -4.286      0.000    -224.032     -83.430
neighbourhood_cleansed_Rosemead                                -36.1754     10.774     -3.358      0.001     -57.293     -15.058
neighbourhood_cleansed_Rowland Heights                         -37.2454      5.077     -7.337      0.000     -47.196     -27.295
neighbourhood_cleansed_San Dimas                               -38.9075     19.520     -1.993      0.046     -77.166      -0.650
neighbourhood_cleansed_San Fernando                            -22.0619     23.308     -0.947      0.344     -67.744      23.620
neighbourhood_cleansed_San Gabriel                             -19.2089      7.801     -2.462      0.014     -34.499      -3.918
neighbourhood_cleansed_San Marino                               25.4529     26.910      0.946      0.344     -27.290      78.196
neighbourhood_cleansed_San Pasqual                             -24.9564     28.688     -0.870      0.384     -81.184      31.272
neighbourhood_cleansed_San Pedro                                 0.0597      8.795      0.007      0.995     -17.177      17.297
neighbourhood_cleansed_Santa Clarita                             3.0098      8.852      0.340      0.734     -14.340      20.360
neighbourhood_cleansed_Santa Fe Springs                        -22.3077     76.876     -0.290      0.772    -172.982     128.367
neighbourhood_cleansed_Santa Monica                             22.0169      3.823      5.760      0.000      14.525      29.509
neighbourhood_cleansed_Sawtelle                                 18.4082      4.269      4.312      0.000      10.042      26.775
neighbourhood_cleansed_Sepulveda Basin                        -255.2112     39.674     -6.433      0.000    -332.970    -177.452
neighbourhood_cleansed_Shadow Hills                            -12.8643     18.980     -0.678      0.498     -50.065      24.336
neighbourhood_cleansed_Sherman Oaks                             36.6685      4.996      7.339      0.000      26.876      46.461
neighbourhood_cleansed_Sierra Madre                             -8.1355     26.189     -0.311      0.756     -59.465      43.194
neighbourhood_cleansed_Signal Hill                             -55.2647     17.132     -3.226      0.001     -88.843     -21.686
neighbourhood_cleansed_Silver Lake                              16.1592      4.197      3.851      0.000       7.934      24.384
neighbourhood_cleansed_South Diamond Bar                       -25.8511     87.633     -0.295      0.768    -197.608     145.906
neighbourhood_cleansed_South El Monte                         -100.3936     36.156     -2.777      0.005    -171.258     -29.529
neighbourhood_cleansed_South Gate                               -6.9549     27.030     -0.257      0.797     -59.932      46.022
neighbourhood_cleansed_South Park                              -45.8174     21.855     -2.096      0.036     -88.652      -2.983
neighbourhood_cleansed_South Pasadena                           -9.6448     11.249     -0.857      0.391     -31.692      12.402
neighbourhood_cleansed_South San Gabriel                       -10.3835     17.573     -0.591      0.555     -44.827      24.060
neighbourhood_cleansed_South San Jose Hills                     26.2865     69.300      0.379      0.704    -109.539     162.112
neighbourhood_cleansed_South Whittier                           -5.2863     21.592     -0.245      0.807     -47.605      37.032
neighbourhood_cleansed_Southeast Antelope Valley                31.0352     42.322      0.733      0.463     -51.915     113.985
neighbourhood_cleansed_Stevenson Ranch                         -85.6831     32.531     -2.634      0.008    -149.444     -21.923
neighbourhood_cleansed_Studio City                               0.5380      4.945      0.109      0.913      -9.155      10.231
neighbourhood_cleansed_Sun Valley                               -5.7665      9.613     -0.600      0.549     -24.607      13.074
neighbourhood_cleansed_Sun Village                              82.0107     76.883      1.067      0.286     -68.678     232.700
neighbourhood_cleansed_Sunland                                  19.8726     30.698      0.647      0.517     -40.295      80.041
neighbourhood_cleansed_Sylmar                                  -11.2519     14.725     -0.764      0.445     -40.112      17.609
neighbourhood_cleansed_Tarzana                                 -17.4822      7.793     -2.243      0.025     -32.755      -2.209
neighbourhood_cleansed_Temple City                             -47.6884      7.967     -5.986      0.000     -63.303     -32.074
neighbourhood_cleansed_Toluca Lake                             -29.8968      9.055     -3.302      0.001     -47.644     -12.149
neighbourhood_cleansed_Topanga                                  55.4604      6.721      8.252      0.000      42.287      68.633
neighbourhood_cleansed_Torrance                                  0.3194      6.451      0.050      0.961     -12.323      12.962
neighbourhood_cleansed_Tujunga                                 -47.7527     21.400     -2.231      0.026     -89.695      -5.810
neighbourhood_cleansed_Tujunga Canyons                          77.1103    277.005      0.278      0.781    -465.812     620.033
neighbourhood_cleansed_Unincorporated Catalina Island          404.3530     46.267      8.740      0.000     313.671     495.035
neighbourhood_cleansed_Unincorporated Santa Monica Mountains   245.2676     10.326     23.754      0.000     225.030     265.505
neighbourhood_cleansed_Unincorporated Santa Susana Mountains    87.1175     28.684      3.037      0.002      30.897     143.338
neighbourhood_cleansed_Universal City                          -55.1037     43.339     -1.271      0.204    -140.048      29.840
neighbourhood_cleansed_University Park                         -70.8888     11.128     -6.370      0.000     -92.699     -49.078
neighbourhood_cleansed_Val Verde                                75.4890     76.888      0.982      0.326     -75.209     226.187
neighbourhood_cleansed_Valinda                                 -27.0072     26.538     -1.018      0.309     -79.020      25.006
neighbourhood_cleansed_Valley Glen                              -0.2315      6.676     -0.035      0.972     -13.316      12.854
neighbourhood_cleansed_Valley Village                            0.2310      7.848      0.029      0.977     -15.150      15.612
neighbourhood_cleansed_Van Nuys                                -22.5752      5.884     -3.837      0.000     -34.108     -11.043
neighbourhood_cleansed_Venice                                   40.7154      3.187     12.775      0.000      34.469      46.962
neighbourhood_cleansed_Vermont Knolls                           10.2944     28.542      0.361      0.718     -45.648      66.237
neighbourhood_cleansed_Vermont Square                          -20.9370      9.630     -2.174      0.030     -39.812      -2.062
neighbourhood_cleansed_Vermont Vista                            14.8491     40.068      0.371      0.711     -63.682      93.381
neighbourhood_cleansed_Vermont-Slauson                         -44.1568     24.532     -1.800      0.072     -92.239       3.925
neighbourhood_cleansed_Vernon                                   84.8923    138.525      0.613      0.540    -186.613     356.397
neighbourhood_cleansed_Veterans Administration                 -27.9663     24.260     -1.153      0.249     -75.515      19.582
neighbourhood_cleansed_View Park-Windsor Hills                  38.1414     10.545      3.617      0.000      17.473      58.809
neighbourhood_cleansed_Vincent                                -244.8165     74.084     -3.305      0.001    -390.018     -99.615
neighbourhood_cleansed_Walnut                                  -59.6075     12.730     -4.683      0.000     -84.557     -34.658
neighbourhood_cleansed_Watts                                    31.7798     22.710      1.399      0.162     -12.731      76.290
neighbourhood_cleansed_West Adams                                1.3356      9.441      0.141      0.888     -17.169      19.840
neighbourhood_cleansed_West Carson                              -7.2083     18.259     -0.395      0.693     -42.996      28.579
neighbourhood_cleansed_West Compton                             17.2397    104.727      0.165      0.869    -188.022     222.501
neighbourhood_cleansed_West Covina                             -55.6305     11.184     -4.974      0.000     -77.550     -33.711
neighbourhood_cleansed_West Hills                              -52.1646     10.841     -4.812      0.000     -73.413     -30.916
neighbourhood_cleansed_West Hollywood                           20.8215      4.121      5.052      0.000      12.744      28.899
neighbourhood_cleansed_West Los Angeles                        -21.2498      7.445     -2.854      0.004     -35.843      -6.657
neighbourhood_cleansed_West Puente Valley                     -184.8430     32.332     -5.717      0.000    -248.214    -121.472
neighbourhood_cleansed_West Whittier-Los Nietos                 -7.3510     33.209     -0.221      0.825     -72.441      57.739
neighbourhood_cleansed_Westchester                               5.3497      5.545      0.965      0.335      -5.518      16.217
neighbourhood_cleansed_Westlake                                 10.2891      3.930      2.618      0.009       2.587      17.991
neighbourhood_cleansed_Westlake Village                        120.2296     40.489      2.969      0.003      40.872     199.587
neighbourhood_cleansed_Westmont                                 38.0801     21.787      1.748      0.080      -4.622      80.782
neighbourhood_cleansed_Westwood                                  9.1625      4.417      2.075      0.038       0.506      17.819
neighbourhood_cleansed_Whittier                                -37.8071     11.339     -3.334      0.001     -60.032     -15.582
neighbourhood_cleansed_Willowbrook                             -82.1912     29.494     -2.787      0.005    -139.998     -24.384
neighbourhood_cleansed_Wilmington                               11.0762     26.656      0.416      0.678     -41.169      63.322
neighbourhood_cleansed_Windsor Square                            1.4635     13.239      0.111      0.912     -24.484      27.411
neighbourhood_cleansed_Winnetka                                -19.0360     10.791     -1.764      0.078     -40.186       2.114
neighbourhood_cleansed_Woodland Hills                           -8.8186      5.520     -1.598      0.110     -19.637       2.000
bathrooms                                                      139.7979      0.883    158.236      0.000     138.066     141.529
bedrooms                                                        21.7480      0.806     26.972      0.000      20.168      23.328
beds                                                            -9.4181      0.430    -21.907      0.000     -10.261      -8.575
amenities                                                       -0.6230      0.041    -15.061      0.000      -0.704      -0.542
cleaning_fee                                                     1.2046      0.008    154.286      0.000       1.189       1.220
minimum_nights                                                  -0.5185      0.022    -23.106      0.000      -0.562      -0.475
==============================================================================
Omnibus:                   781287.885   Durbin-Watson:                   1.881
Prob(Omnibus):                  0.000   Jarque-Bera (JB):       5314731761.477
Skew:                          18.297   Prob(JB):                         0.00
Kurtosis:                     588.816   Cond. No.                     2.28e+16
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The smallest eigenvalue is 1.11e-23. This might indicate that there are
strong multicollinearity problems or that the design matrix is singular.
In [ ]:
 
In [ ]:
d = {'col1': [1, 2], 'col2': [3, 4]}
In [96]:
factors = {'features':['room_type_Entire home/apt','room_type_Private room','room_type_Shared room','neighbourhood_cleansed_Avalon',
                      'Bel-Air','Beverly Crest','Beverly Hills','Century City','Encino','Hasley Canyon','Hollywood Hills West','Malibu',
                      'Pacific Palisades','Unincorporated Catalina Island','Unincorporated Santa Monica Mountains','Unincorporated Santa Susana Mountains',
                      'Vernon','Westlake Village','bathrooms','bedrooms','beds','cleaning_fee','minimum_nights','amenities'],
          'coeffits':[77.4344,88.3273, 5.6587,139.7511, 573.3431 ,285.5859, 82.3291,286.9593, 84.4890,84.4743, 226.2936, 609.6742,
                     100.0329,404.3530,245.2676, 87.1175, 84.8923, 120.2296,139.7979,21.7480, -9.4181, 1.2046, -0.5185, -0.6230]}
In [213]:
factors_df = pd.DataFrame(data=factors)
In [214]:
factors_df.to_csv('/Users/sitebai/Desktop/UCI/fall/bana212da/listing_zips/factors.csv')
In [101]:
print(os.getcwd('/Users/sitebai/Desktop/UCI/fall/bana212da/listing_zips/factors.csv'))
/Users/sitebai/Desktop/UCI/fall/bana212da/listing_zips
In [191]:
coef_df= pd.DataFrame(ols_result.summary().tables[1].data)

coef_df.columns = coef_df.iloc[0]
coef_df = coef_df.drop(0)
In [193]:
coef_df = coef_df.set_index(coef_df.columns[0])
In [195]:
coef_df = coef_df.astype(float)
In [196]:
errors = coef_df['coef'] - coef_df['[0.025']
In [197]:
coef_df['errors'] = errors
coef_df = coef_df.sort_values(by=['coef'])
variables = list(coef_df.index.values)
In [199]:
coef_df['variables'] = variables
In [201]:
coef_df
Out[201]:
coef std err t P>|t| [0.025 0.975] errors variables
room_type_Hotel room -446.8670 159.747 -2.797 0.005 -759.966 -133.768 313.0990 room_type_Hotel room
neighbourhood_cleansed_Sepulveda Basin -255.2112 39.674 -6.433 0.000 -332.970 -177.452 77.7588 neighbourhood_cleansed_Sepulveda Basin
neighbourhood_cleansed_Vincent -244.8165 74.084 -3.305 0.001 -390.018 -99.615 145.2015 neighbourhood_cleansed_Vincent
neighbourhood_cleansed_North Whittier -243.6398 80.008 -3.045 0.002 -400.452 -86.827 156.8122 neighbourhood_cleansed_North Whittier
neighbourhood_cleansed_West Puente Valley -184.8430 32.332 -5.717 0.000 -248.214 -121.472 63.3710 neighbourhood_cleansed_West Puente Valley
... ... ... ... ... ... ... ... ...
neighbourhood_cleansed_Beverly Crest 285.5859 7.893 36.184 0.000 270.116 301.055 15.4699 neighbourhood_cleansed_Beverly Crest
neighbourhood_cleansed_Century City 286.9593 16.527 17.363 0.000 254.566 319.352 32.3933 neighbourhood_cleansed_Century City
neighbourhood_cleansed_Unincorporated Catalina Island 404.3530 46.267 8.740 0.000 313.671 495.035 90.6820 neighbourhood_cleansed_Unincorporated Catalina...
neighbourhood_cleansed_Bel-Air 573.3431 11.313 50.678 0.000 551.169 595.517 22.1741 neighbourhood_cleansed_Bel-Air
neighbourhood_cleansed_Malibu 609.6742 6.241 97.695 0.000 597.443 621.906 12.2312 neighbourhood_cleansed_Malibu

278 rows × 8 columns

Neighbourhood only

In [202]:
without_neighbours = ['room_type_Entire home/apt','room_type_Private room','room_type_Shared room',
                     'room_type_Hotel room', 'bathrooms','bedrooms','beds','cleaning_fee','minimum_nights',
                      'amenities','property_type_Apartment','property_type_Condominium','property_type_Guesthouse',
                     'property_type_House']
In [206]:
with_neighbours_coeffcis = coef_df.drop(without_neighbours)
In [207]:
with_neighbours_coeffcis
Out[207]:
coef std err t P>|t| [0.025 0.975] errors variables
neighbourhood_cleansed_Sepulveda Basin -255.2112 39.674 -6.433 0.000 -332.970 -177.452 77.7588 neighbourhood_cleansed_Sepulveda Basin
neighbourhood_cleansed_Vincent -244.8165 74.084 -3.305 0.001 -390.018 -99.615 145.2015 neighbourhood_cleansed_Vincent
neighbourhood_cleansed_North Whittier -243.6398 80.008 -3.045 0.002 -400.452 -86.827 156.8122 neighbourhood_cleansed_North Whittier
neighbourhood_cleansed_West Puente Valley -184.8430 32.332 -5.717 0.000 -248.214 -121.472 63.3710 neighbourhood_cleansed_West Puente Valley
neighbourhood_cleansed_Green Valley -170.8632 40.498 -4.219 0.000 -250.238 -91.489 79.3748 neighbourhood_cleansed_Green Valley
... ... ... ... ... ... ... ... ...
neighbourhood_cleansed_Beverly Crest 285.5859 7.893 36.184 0.000 270.116 301.055 15.4699 neighbourhood_cleansed_Beverly Crest
neighbourhood_cleansed_Century City 286.9593 16.527 17.363 0.000 254.566 319.352 32.3933 neighbourhood_cleansed_Century City
neighbourhood_cleansed_Unincorporated Catalina Island 404.3530 46.267 8.740 0.000 313.671 495.035 90.6820 neighbourhood_cleansed_Unincorporated Catalina...
neighbourhood_cleansed_Bel-Air 573.3431 11.313 50.678 0.000 551.169 595.517 22.1741 neighbourhood_cleansed_Bel-Air
neighbourhood_cleansed_Malibu 609.6742 6.241 97.695 0.000 597.443 621.906 12.2312 neighbourhood_cleansed_Malibu

264 rows × 8 columns

In [208]:
fig, ax = plt.subplots(figsize=(15, 10))
with_neighbours_coeffcis.plot(x='variables', y='coef', kind='bar',
                 ax=ax, color='none', fontsize=22, 
                 ecolor='steelblue',capsize=0,
                 yerr='errors', legend=False)
plt.title('Coefficients of Features w/ 95% Confidence Intervals',fontsize=30)
ax.set_ylabel('Coefficients',fontsize=22)
ax.set_xlabel('',fontsize=22)
ax.scatter(x=pd.np.arange(coef_df.shape[0]), 
               marker='o', s=80, 
               y=coef_df['coef'], color='steelblue')
    
    # Line to define zero on the y-axis
ax.axhline(y=0, linestyle='--', color='red', linewidth=1)
/opt/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:9: FutureWarning: The pandas.np module is deprecated and will be removed from pandas in a future version. Import numpy directly instead
  if __name__ == '__main__':
Out[208]:
<matplotlib.lines.Line2D at 0x7fea0447efd0>
In [ ]:
 
In [210]:
without_neighbours_df = coef_df.loc[without_neighbours]
In [211]:
fig, ax = plt.subplots(figsize=(15, 10))
without_neighbours_df.plot(x='variables', y='coef', kind='bar',
                 ax=ax, color='none', fontsize=22, 
                 ecolor='steelblue',capsize=0,
                 yerr='errors', legend=False)
plt.title('Coefficients of Features w/ 95% Confidence Intervals',fontsize=30)
ax.set_ylabel('Coefficients',fontsize=22)
ax.set_xlabel('',fontsize=22)
ax.scatter(x=pd.np.arange(coef_df.shape[0]), 
               marker='o', s=80, 
               y=coef_df['coef'], color='steelblue')
    
    # Line to define zero on the y-axis
ax.axhline(y=0, linestyle='--', color='red', linewidth=1)
/opt/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:9: FutureWarning: The pandas.np module is deprecated and will be removed from pandas in a future version. Import numpy directly instead
  if __name__ == '__main__':
Out[211]:
<matplotlib.lines.Line2D at 0x7fea05f2ff10>
In [198]:
coef_df
Out[198]:
coef std err t P>|t| [0.025 0.975] errors
room_type_Hotel room -446.8670 159.747 -2.797 0.005 -759.966 -133.768 313.0990
neighbourhood_cleansed_Sepulveda Basin -255.2112 39.674 -6.433 0.000 -332.970 -177.452 77.7588
neighbourhood_cleansed_Vincent -244.8165 74.084 -3.305 0.001 -390.018 -99.615 145.2015
neighbourhood_cleansed_North Whittier -243.6398 80.008 -3.045 0.002 -400.452 -86.827 156.8122
neighbourhood_cleansed_West Puente Valley -184.8430 32.332 -5.717 0.000 -248.214 -121.472 63.3710
... ... ... ... ... ... ... ...
neighbourhood_cleansed_Beverly Crest 285.5859 7.893 36.184 0.000 270.116 301.055 15.4699
neighbourhood_cleansed_Century City 286.9593 16.527 17.363 0.000 254.566 319.352 32.3933
neighbourhood_cleansed_Unincorporated Catalina Island 404.3530 46.267 8.740 0.000 313.671 495.035 90.6820
neighbourhood_cleansed_Bel-Air 573.3431 11.313 50.678 0.000 551.169 595.517 22.1741
neighbourhood_cleansed_Malibu 609.6742 6.241 97.695 0.000 597.443 621.906 12.2312

278 rows × 7 columns

In [ ]:
 
In [181]:
def coefplot(results):
    coef_df= pd.DataFrame(ols_result.summary().tables[1].data)
    coef_df.columns = coef_df.iloc[0]
    coef_df = coef_df.drop(0)
    coef_df = coef_df.set_index(coef_df.columns[0])
    coef_df = coef_df.astype(float)
    errors = coef_df['coef'] - coef_df['[0.025']
    coef_df['errors'] = errors
    coef_df = coef_df.sort_values(by=['coef'])
    variables = list(coef_df.index.values)
    coef_df['variables'] = variables
    sns.set_context("poster")
    fig, ax = plt.subplots(figsize=(15, 10))
    coef_df.plot(x='variables', y='coef', kind='bar',
                 ax=ax, color='none', fontsize=22, 
                 ecolor='steelblue',capsize=0,
                 yerr='errors', legend=False)
    plt.title('Coefficients of Features w/ 95% Confidence Intervals',fontsize=30)
    ax.set_ylabel('Coefficients',fontsize=22)
    ax.set_xlabel('',fontsize=22)
    ax.scatter(x=pd.np.arange(coef_df.shape[0]), 
               marker='o', s=80, 
               y=coef_df['coef'], color='steelblue')
    
    # Line to define zero on the y-axis
    ax.axhline(y=0, linestyle='--', color='red', linewidth=1)
    
    return plt.show()
    
    
    
    
    
    
    
    
In [182]:
coefplot(ols_result)
/opt/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:21: FutureWarning: The pandas.np module is deprecated and will be removed from pandas in a future version. Import numpy directly instead

Seperate Coefficients

In [ ]:
 
In [177]:
pd.DataFrame(ols_result.summary().tables[1].data)
Out[177]:
0 1 2 3 4 5 6
0 coef std err t P>|t| [0.025 0.975]
1 host_is_superhost_0 -133.5204 24.542 -5.441 0.000 -181.622 -85.419
2 host_is_superhost_1 -141.9262 24.552 -5.781 0.000 -190.047 -93.805
3 property_type_Apartment -75.6541 12.299 -6.151 0.000 -99.760 -51.549
4 property_type_Condominium -91.7552 12.366 -7.420 0.000 -115.991 -67.519
... ... ... ... ... ... ... ...
274 bedrooms 21.7480 0.806 26.972 0.000 20.168 23.328
275 beds -9.4181 0.430 -21.907 0.000 -10.261 -8.575
276 amenities -0.6230 0.041 -15.061 0.000 -0.704 -0.542
277 cleaning_fee 1.2046 0.008 154.286 0.000 1.189 1.220
278 minimum_nights -0.5185 0.022 -23.106 0.000 -0.562 -0.475

279 rows × 7 columns

In [ ]:
 
In [102]:
factors_df.to_csv('/Users/sitebai/Desktop/UCI/fall/bana212da/listing_zips/factors_df.csv')
In [98]:
factors.to_frame()
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-98-407a0b9ec4e3> in <module>
----> 1 factors.to_frame()

AttributeError: 'dict' object has no attribute 'to_frame'
In [ ]:
neighbourhood_cleansed_Avalon : 135.6014
    
    
In [83]:
ols_result.summary()[0]
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-83-c83a9503d1e1> in <module>
----> 1 ols_result.summary()[0]

TypeError: 'Summary' object is not subscriptable

Linear Regression with predict

In [11]:
from sklearn import metrics

Split the dataset to 80% of training and 20% of testing

In [36]:
y_train= airbnb_with_dummies['price']
In [ ]:
no_sale_x_train
In [37]:
X_train, X_test, Y_train, Y_test = train_test_split(noscale_x_train, y_train, test_size = 0.2, random_state = 462)
In [328]:
airbnb_linreg = LinearRegression()
In [329]:
airbnb_linreg.fit(X_train, Y_train)
Out[329]:
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)
In [353]:
airbnb_linreg.score(X_train, Y_train)
Out[353]:
0.42307152745767174
In [361]:
airbnb_linreg.score(X_test,Y_test )
Out[361]:
0.43476554480341634
In [330]:
airbnb_y_predict = airbnb_linreg.predict(X_test)
In [333]:
print(metrics.mean_squared_error(Y_test,airbnb_y_predict))
71950.48813324355
In [270]:
np.sqrt(71950)
Out[270]:
268.23497162003315
In [162]:
#please take the sqrt of mean 



import math
math.sqrt(71950)
Out[162]:
268.23497162003315
In [ ]:
sirbnb_y_predict.score()
In [ ]:
airbnb_dum

Cross_validation linear regression

In [58]:
noscale_x_train.head()
Out[58]:
host_is_superhost_0 host_is_superhost_1 property_type_Apartment property_type_Condominium property_type_Guesthouse property_type_House room_type_Entire home/apt room_type_Hotel room room_type_Private room room_type_Shared room ... neighbourhood_cleansed_Wilmington neighbourhood_cleansed_Windsor Square neighbourhood_cleansed_Winnetka neighbourhood_cleansed_Woodland Hills bathrooms bedrooms beds amenities cleaning_fee minimum_nights
0 1 0 0 1 0 0 1 0 0 0 ... 0 0 0 0 2.0 2.0 3.0 32 240.0 7
1 1 0 0 0 0 1 1 0 0 0 ... 0 0 0 0 1.0 3.0 3.0 41 100.0 2
2 0 1 1 0 0 0 0 0 1 0 ... 0 0 0 0 1.5 1.0 1.0 43 85.0 6
3 1 0 1 0 0 0 0 0 1 0 ... 0 0 0 0 1.0 1.0 1.0 12 100.0 1
4 1 0 1 0 0 0 1 0 0 0 ... 0 0 0 0 1.0 1.0 1.0 20 75.0 2

5 rows × 278 columns

In [56]:
y_train.shape
Out[56]:
(370236,)
In [75]:
airbnb_linear = LinearRegression()
cv_linear = cross_val_score(airbnb_linear, noscale_x_train, y_train, scoring  ='neg_mean_squared_error', cv = 10 )
In [80]:
mean(np.sqrt(absolute(cv_linear)))
Out[80]:
837464635.4185681
In [67]:
noscale_x_train.head()
Out[67]:
host_is_superhost_0 host_is_superhost_1 property_type_Apartment property_type_Condominium property_type_Guesthouse property_type_House room_type_Entire home/apt room_type_Hotel room room_type_Private room room_type_Shared room ... neighbourhood_cleansed_Wilmington neighbourhood_cleansed_Windsor Square neighbourhood_cleansed_Winnetka neighbourhood_cleansed_Woodland Hills bathrooms bedrooms beds amenities cleaning_fee minimum_nights
0 1 0 0 1 0 0 1 0 0 0 ... 0 0 0 0 2.0 2.0 3.0 32 240.0 7
1 1 0 0 0 0 1 1 0 0 0 ... 0 0 0 0 1.0 3.0 3.0 41 100.0 2
2 0 1 1 0 0 0 0 0 1 0 ... 0 0 0 0 1.5 1.0 1.0 43 85.0 6
3 1 0 1 0 0 0 0 0 1 0 ... 0 0 0 0 1.0 1.0 1.0 12 100.0 1
4 1 0 1 0 0 0 1 0 0 0 ... 0 0 0 0 1.0 1.0 1.0 20 75.0 2

5 rows × 278 columns

In [61]:
square_root_mse = np.sqrt(-cv_linear)
square_root_mse.mean()
Out[61]:
1873542729.7720609
In [63]:
airbnb_linear_regression = LinearRegression()
airbnb_linear_mse = cross_val_score(airbnb_linear_regression , noscale_x_train, y_train, scoring ='neg_mean_squared_error', cv =10)
In [65]:
airbnb_linear_mse
Out[65]:
array([-7.04721329e+04, -5.84521208e+04, -6.71369725e+04, -1.02080727e+05,
       -8.99392991e+12, -7.81378652e+04, -7.11894675e+04, -3.50869612e+20,
       -8.47506922e+04, -8.32694174e+11])
In [51]:
mean(absolute(airbnb_linear_mse))
Out[51]:
3.508696215194728e+19
In [66]:
mean(np.sqrt(absolute(airbnb_linear_mse )))
Out[66]:
1873542729.7720609
In [272]:
np.sqrt(-airbnb_linear_mse).mean()
Out[272]:
4024322869.938216
In [ ]:
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size = 0.2, random_state = 462)
In [64]:
lin_reg = LinearRegression()
In [104]:
y = airbnb_with_dummies['price']
In [105]:
x = airbnb_with_dummies.drop('price',axis = 1)
In [67]:
scores = cross_val_score(lin_reg, x, y,scoring = "neg_mean_squared_error", cv=10)
In [68]:
#no need to sqrt of mse(double check)

lin_rmse_scores = np.sqrt(-scores)
In [69]:
def display_scores(scores):
    print("Scores", scores)
    print("Mean", scores.mean())
    print("Standard Deviation:", scores.std())
In [70]:
display_scores(lin_rmse_scores)
Scores [263.2396039  246.10603172 298.32954574 270.0570037  287.47820772
 270.71407221 275.17728151 283.96015202 282.14897205 302.36250888]
Mean 277.9573379454447
Standard Deviation: 15.838611209397232

Applying Lasso Regression and compare to Linear Regression shown above.

In [8]:
from numpy import mean
from numpy import std
from numpy import absolute
from pandas import read_csv
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.linear_model import Lasso
from numpy import arange
In [107]:
airbnb_lasso1 = Lasso(alpha=1.0)
In [243]:
model = Lasso(alpha=1.0)
In [262]:
model_2 = Lasso(alpha=0.0)
In [ ]:
model_2_results = absolute(scores)
print('Mean MAE: %.3f (%.3f)' % (mean(model_2_results), std(model_2_results)))
In [256]:
scores = cross_val_score(model, x_train, y_train, scoring='neg_mean_squared_error', cv=10, n_jobs=-1)
In [246]:
positive_scores = absolute(scores)
print('Mean MAE: %.3f (%.3f)' % (mean(positive_scores), std(positive_scores)))
Mean MAE: 79369.892 (12005.734)
In [260]:
lasso_cv = LassoCV(alphas=arange(0, 1, 0.01), cv=10, n_jobs=-1)
lasso_cv.fit(x_train, y_train)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/coordinate_descent.py:471: UserWarning: Coordinate descent with alpha=0 may lead to unexpected results and is discouraged.
  tol, rng, random, positive)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/coordinate_descent.py:471: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 12740367512.57365, tolerance: 4434903.583138396
  tol, rng, random, positive)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/coordinate_descent.py:471: UserWarning: Coordinate descent with alpha=0 may lead to unexpected results and is discouraged.
  tol, rng, random, positive)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/coordinate_descent.py:471: UserWarning: Coordinate descent with alpha=0 may lead to unexpected results and is discouraged.
  tol, rng, random, positive)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/coordinate_descent.py:471: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 13068001530.178192, tolerance: 4541757.386329397
  tol, rng, random, positive)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/coordinate_descent.py:471: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 12420741107.262943, tolerance: 4369947.248675438
  tol, rng, random, positive)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/coordinate_descent.py:471: UserWarning: Coordinate descent with alpha=0 may lead to unexpected results and is discouraged.
  tol, rng, random, positive)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/coordinate_descent.py:471: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 13006404012.119423, tolerance: 4539632.777665123
  tol, rng, random, positive)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/coordinate_descent.py:471: UserWarning: Coordinate descent with alpha=0 may lead to unexpected results and is discouraged.
  tol, rng, random, positive)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/coordinate_descent.py:471: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 12863791155.630623, tolerance: 4475810.407127853
  tol, rng, random, positive)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/coordinate_descent.py:471: UserWarning: Coordinate descent with alpha=0 may lead to unexpected results and is discouraged.
  tol, rng, random, positive)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/coordinate_descent.py:471: UserWarning: Coordinate descent with alpha=0 may lead to unexpected results and is discouraged.
  tol, rng, random, positive)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/coordinate_descent.py:471: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 12991337602.572575, tolerance: 4508273.820561129
  tol, rng, random, positive)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/coordinate_descent.py:471: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 12878889715.706734, tolerance: 4493194.088917457
  tol, rng, random, positive)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/coordinate_descent.py:471: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 12684277910.65305, tolerance: 4393909.588271972
  tol, rng, random, positive)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/coordinate_descent.py:471: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 13228966442.928371, tolerance: 4588041.206733395
  tol, rng, random, positive)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/coordinate_descent.py:471: UserWarning: Coordinate descent with alpha=0 may lead to unexpected results and is discouraged.
  tol, rng, random, positive)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/coordinate_descent.py:471: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 12895499804.564903, tolerance: 4483219.280519044
  tol, rng, random, positive)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/coordinate_descent.py:1227: UserWarning: With alpha=0, this algorithm does not converge well. You are advised to use the LinearRegression estimator
  model.fit(X, y)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/coordinate_descent.py:475: UserWarning: Coordinate descent with no regularization may lead to unexpected results and is discouraged.
  positive)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/coordinate_descent.py:475: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 14309862203.68245, tolerance: 4980977.899448478
  positive)
Out[260]:
LassoCV(alphas=array([0.  , 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1 ,
       0.11, 0.12, 0.13, 0.14, 0.15, 0.16, 0.17, 0.18, 0.19, 0.2 , 0.21,
       0.22, 0.23, 0.24, 0.25, 0.26, 0.27, 0.28, 0.29, 0.3 , 0.31, 0.32,
       0.33, 0.34, 0.35, 0.36, 0.37, 0.38, 0.39, 0.4 , 0.41, 0.42, 0.43,
       0.44, 0.45, 0.46, 0.47, 0.48, 0.49, 0.5 , 0.51, 0.52, 0.53, 0.54,
       0.55, 0.56, 0.57, 0.58, 0.59, 0.6 , 0.61, 0.62, 0.63, 0.64, 0.65,
       0.66, 0..., 0.69, 0.7 , 0.71, 0.72, 0.73, 0.74, 0.75, 0.76,
       0.77, 0.78, 0.79, 0.8 , 0.81, 0.82, 0.83, 0.84, 0.85, 0.86, 0.87,
       0.88, 0.89, 0.9 , 0.91, 0.92, 0.93, 0.94, 0.95, 0.96, 0.97, 0.98,
       0.99]),
        copy_X=True, cv=10, eps=0.001, fit_intercept=True, max_iter=1000,
        n_alphas=100, n_jobs=-1, normalize=False, positive=False,
        precompute='auto', random_state=None, selection='cyclic', tol=0.0001,
        verbose=False)
In [261]:
print('alpha: %f' % lasso_cv.alpha_)
alpha: 0.000000

You can the the average value of mean squared error after scaling the data dropped to 79358.379 compared

to 79369.892 before standardize the data. But not in a very big amount.

In [257]:
after_scaling = absolute(scores)
print('Mean MAE: %.3f (%.3f)' % (mean(after_scaling), std(after_scaling)))
Mean MAE: 79358.379 (12003.833)
In [247]:
x_train.head()
Out[247]:
host_is_superhost_0 host_is_superhost_1 property_type_Apartment property_type_Condominium property_type_Guesthouse property_type_House room_type_Entire home/apt room_type_Hotel room room_type_Private room room_type_Shared room ... neighbourhood_cleansed_Willowbrook neighbourhood_cleansed_Wilmington neighbourhood_cleansed_Windsor Square neighbourhood_cleansed_Winnetka neighbourhood_cleansed_Woodland Hills bathrooms bedrooms beds cleaning_fee minimum_nights
0 1 0 0 1 0 0 1 0 0 0 ... 0 0 0 0 0 2.0 2.0 3.0 240.0 7
1 1 0 0 0 0 1 1 0 0 0 ... 0 0 0 0 0 1.0 3.0 3.0 100.0 2
2 0 1 1 0 0 0 0 0 1 0 ... 0 0 0 0 0 1.5 1.0 1.0 85.0 6
3 1 0 1 0 0 0 0 0 1 0 ... 0 0 0 0 0 1.0 1.0 1.0 100.0 1
4 1 0 1 0 0 0 1 0 0 0 ... 0 0 0 0 0 1.0 1.0 1.0 75.0 2

5 rows × 277 columns

In [254]:
x_train[['bathrooms','bedrooms','beds','cleaning_fee','minimum_nights']] = scaler.fit_transform(x_train[['bathrooms','bedrooms','beds','cleaning_fee','minimum_nights']])
In [255]:
x_train.head()
Out[255]:
host_is_superhost_0 host_is_superhost_1 property_type_Apartment property_type_Condominium property_type_Guesthouse property_type_House room_type_Entire home/apt room_type_Hotel room room_type_Private room room_type_Shared room ... neighbourhood_cleansed_Willowbrook neighbourhood_cleansed_Wilmington neighbourhood_cleansed_Windsor Square neighbourhood_cleansed_Winnetka neighbourhood_cleansed_Woodland Hills bathrooms bedrooms beds cleaning_fee minimum_nights
0 1 0 0 1 0 0 1 0 0 0 ... 0 0 0 0 0 0.648862 0.489356 0.581388 1.753823 0.029870
1 1 0 0 0 0 1 1 0 0 0 ... 0 0 0 0 0 -0.514918 1.398624 0.581388 0.151685 -0.210491
2 0 1 1 0 0 0 0 0 1 0 ... 0 0 0 0 0 0.066972 -0.419912 -0.625181 -0.019973 -0.018203
3 1 0 1 0 0 0 0 0 1 0 ... 0 0 0 0 0 -0.514918 -0.419912 -0.625181 0.151685 -0.258563
4 1 0 1 0 0 0 1 0 0 0 ... 0 0 0 0 0 -0.514918 -0.419912 -0.625181 -0.134411 -0.210491

5 rows × 277 columns

In [251]:
#Standardize dataset first
scaler = StandardScaler()
scaler.transform()
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-251-694d944f20c7> in <module>
      1 #Standardize dataset first
      2 scaler = StandardScaler()
----> 3 scaler.transform()

TypeError: transform() missing 1 required positional argument: 'X'
In [242]:
airbnb_lasso1 = LassoCV(normalize = True, cv = 10).fit(x,y)
importance = np.abs(airbnb_lasso1.coef_)
In [347]:
#importance
In [240]:
from sklearn import linear_model
airbnb_lasso3 = linear_model.Lasso(alpha = 0.1)
In [ ]:
 
In [237]:
airbnb_lasso2 = Lasso()
airbnb_lasso2.fit(dlx_train, dly_train)
train_score = lasso.score(dlx_test, dly_test)

#airbnb_lasso2 = Lasso(alpha = 0.1)
#cv = cross_val_score(airbnb_lasso2, x,y ,scoring='neg_mean_absolute_error', cv =10)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-237-29845ad128d3> in <module>
----> 1 airbnb_lasso2 = Lasso(alpha = 0.1)
      2 cv = cross_val_score(airbnb_lasso2, x,y ,scoring='neg_mean_absolute_error', cv =10)

NameError: name 'Lasso' is not defined
In [10]:
#cv = absolute(scores)
X_train, X_test, Y_train, Y_test = train_test_split(noscale_x_train, y_train, test_size = 0.2, random_state = 462)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-10-5694ba9e6400> in <module>
      1 #cv = absolute(scores)
----> 2 X_train, X_test, Y_train, Y_test = train_test_split(noscale_x_train, y_train, test_size = 0.2, random_state = 462)

NameError: name 'train_test_split' is not defined
In [169]:
aribnb_lasso2 = Lasso(alpha = 0.01)
aribnb_lasso2.fit(X_train, Y_train)
train_score = aribnb_lasso2.score(X_train, Y_train)
test_score = aribnb_lasso2.score(X_test, Y_test)
In [176]:
importance = np.abs(aribnb_lasso2.coef_)
len(importance[importance != 0])
Out[176]:
173
In [ ]:
281
In [ ]:
# loop alpha 0.001,0.02, 0.5
#it'a suppoed to be lower than linear regression 
In [355]:
print(train_score)
print(test_score)
0.422840925902246
0.43466159004254656
In [356]:
airbnb_yhat = aribnb_lasso2.predict(X_test)
In [39]:
from sklearn.metrics import mean_squared_error
In [359]:
mean_squared_error(Y_test, airbnb_yhat)
Out[359]:
71963.72086475803
In [271]:
np.sqrt(71963)
Out[271]:
268.25920301081936
In [14]:
X_train, X_test, Y_train, Y_test = train_test_split(noscale_x_train, y_train, test_size = 0.2, random_state = 462)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-14-efc0885b7b5a> in <module>
----> 1 X_train, X_test, Y_train, Y_test = train_test_split(noscale_x_train, y_train, test_size = 0.2, random_state = 462)

NameError: name 'noscale_x_train' is not defined
In [40]:
alphas = [0.01, 0.001, 0.002]
lasso_scores = {}
lasso_mses= []
for i in alphas:
    lasso_model = Lasso(alpha = i)
    lasso_model.fit(X_train, Y_train)
    train_score = lasso_model.score(X_train, Y_train)
    test_score = lasso_model.score(X_test, Y_test)
    lasso_scores[i] = (train_score, test_score)
    y_hat = lasso_model.predict(X_test)
    lasso_mses.append(mean_squared_error(Y_test,y_hat))
    
In [165]:
lasso_scores = {}
lasso_mses= []


for i in range(1, 51):
    
    j = 0.001 * i
    lasso_model = Lasso(alpha = j)
    lasso_model.fit(X_train, Y_train)
    train_score = lasso_model.score(X_train, Y_train)
    test_score = lasso_model.score(X_test, Y_test)
    lasso_scores[i] = (train_score, test_score)
    y_hat = lasso_model.predict(X_test)
    lasso_mses.append(mean_squared_error(Y_test,y_hat))
    
    
In [ ]:
 
In [166]:
lasso_scores
Out[166]:
{1: (0.4234278279344956, 0.43507132260639003),
 2: (0.423409796641226, 0.4350690381016721),
 3: (0.42338201124016694, 0.4350565908313695),
 4: (0.42335415817959765, 0.43504263693800327),
 5: (0.42332970206849274, 0.4350268306621843),
 6: (0.42330323914113244, 0.435007747899558),
 7: (0.42327599732399024, 0.43498790521525454),
 8: (0.42325327512450983, 0.4349791762442484),
 9: (0.42322926409695805, 0.43496888768536446),
 10: (0.42320411089683574, 0.43495579747305646),
 11: (0.4231775250025356, 0.4349414878795451),
 12: (0.42314964414202416, 0.4349252582115315),
 13: (0.4231217082881994, 0.4349118775719627),
 14: (0.4230931507752297, 0.434897990621831),
 15: (0.42306457308298273, 0.43488405007990805),
 16: (0.42303609091892236, 0.4348706408010571),
 17: (0.42300785190366585, 0.43485710231819075),
 18: (0.42297924369326334, 0.4348424343352718),
 19: (0.4229507242123708, 0.43482640284950325),
 20: (0.42292176860608166, 0.4348095649447056),
 21: (0.42289219593726457, 0.43479181772910613),
 22: (0.42286329160616143, 0.4347759310167405),
 23: (0.42283478563959365, 0.434759757781609),
 24: (0.4228084132734953, 0.43474378980277284),
 25: (0.4227819828752344, 0.43472769703407277),
 26: (0.4227547946434398, 0.43471087727744173),
 27: (0.42272726842842345, 0.43469385879663813),
 28: (0.42269918485146296, 0.43467652906127985),
 29: (0.422670614427061, 0.4346589034537728),
 30: (0.4226434162806195, 0.4346437988665799),
 31: (0.42261711419811043, 0.4346305371091609),
 32: (0.42259321953719065, 0.43461742491380606),
 33: (0.42256942837892697, 0.43460413607756365),
 34: (0.4225456893907643, 0.4345908422470668),
 35: (0.42252179726362293, 0.4345775670036277),
 36: (0.4224981627957266, 0.43456456369768226),
 37: (0.42247598152398336, 0.43455287205636095),
 38: (0.42245401771909197, 0.43454109645645655),
 39: (0.4224321407549054, 0.4345297699133832),
 40: (0.4224136674310782, 0.4345221002831131),
 41: (0.42239680559936177, 0.4345162044774781),
 42: (0.4223797674533484, 0.434510090342511),
 43: (0.42236304852147033, 0.43450508400025667),
 44: (0.42234598998767336, 0.4344996988909683),
 45: (0.4223287675274381, 0.43449427630911575),
 46: (0.42231152340447253, 0.4344890814810042),
 47: (0.4222941681805892, 0.43448386645141),
 48: (0.42227685332923137, 0.4344786940528488),
 49: (0.4222593517701426, 0.43447343063189614),
 50: (0.42224167704952376, 0.4344681356851242)}
In [167]:
lasso_mses
Out[167]:
[71911.56470601435,
 71911.85550787205,
 71913.4399604226,
 71915.21619580568,
 71917.22822683156,
 71919.65733226431,
 71922.1831705397,
 71923.2943089691,
 71924.60397229037,
 71926.27026696979,
 71928.09178055353,
 71930.1577065159,
 71931.86097060332,
 71933.62868456957,
 71935.40322040507,
 71937.11013007764,
 71938.83348653947,
 71940.7006206486,
 71942.74131936386,
 71944.88466971733,
 71947.14376912934,
 71949.16603917857,
 71951.22478162024,
 71953.2573963365,
 71955.30589595066,
 71957.44693616638,
 71959.61327261287,
 71961.8192296397,
 71964.06284917159,
 71965.98556002475,
 71967.67369122592,
 71969.34278419719,
 71971.03436234775,
 71972.72657623283,
 71974.41642411232,
 71976.07165620498,
 71977.55992234377,
 71979.05887583715,
 71980.50066746095,
 71981.47695904519,
 71982.22745486067,
 71983.00574250988,
 71983.64301570802,
 71984.3285033643,
 71985.01876102225,
 71985.68002718351,
 71986.34386485562,
 71987.0022758882,
 71987.67227344993,
 71988.3462840333]
In [503]:
np.sqrt(71988)
Out[503]:
268.3057956884271
In [43]:
alphas = [0.001, 0.02, 1]
lasso_scores = {}
lasso_mses= []
for i in alphas:
    lasso_model = Lasso(alpha = i)
    lasso_model.fit(X_train, Y_train)
    train_score = lasso_model.score(X_train, Y_train)
    test_score = lasso_model.score(X_test, Y_test)
    lasso_scores[i] = (train_score, test_score)
    y_hat = lasso_model.predict(X_test)
    lasso_mses.append(mean_squared_error(Y_test,y_hat))
In [44]:
lasso_scores
Out[44]:
{0.001: (0.4234278279344956, 0.43507132260639003),
 0.02: (0.42292176860608166, 0.4348095649447056),
 1: (0.40761892500747265, 0.4272124477551996)}
In [45]:
lasso_mses
Out[45]:
[71911.56470601435, 71944.88466971733, 72911.94583374402]

Deep learning model

In [304]:
#!pip install tensorflow
import tensorflow as tf
In [227]:
#!pip install Keras
Collecting Keras
  Downloading Keras-2.4.3-py2.py3-none-any.whl (36 kB)
Requirement already satisfied: numpy>=1.9.1 in /opt/anaconda3/lib/python3.7/site-packages (from Keras) (1.17.2)
Requirement already satisfied: pyyaml in /opt/anaconda3/lib/python3.7/site-packages (from Keras) (5.1.2)
Requirement already satisfied: scipy>=0.14 in /opt/anaconda3/lib/python3.7/site-packages (from Keras) (1.3.1)
Requirement already satisfied: h5py in /opt/anaconda3/lib/python3.7/site-packages (from Keras) (2.10.0)
Requirement already satisfied: six in /opt/anaconda3/lib/python3.7/site-packages (from h5py->Keras) (1.12.0)
Installing collected packages: Keras
Successfully installed Keras-2.4.3
In [230]:
import keras
from keras.layers import Dense
from keras.models import Sequential
In [194]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation
from tensorflow.keras.optimizers import Adam
In [ ]:
from sklearn.model_selection import tranin_test_split
In [185]:
dlx_train, dlx_test, dly_train, dly_test = train_test_split(x,y, test_size = 0.2, random_state = 462)
In [249]:
from sklearn.preprocessing import StandardScaler
In [141]:
from sklearn.model_selection import StratifiedKFold
In [ ]:
 
In [200]:
k_fold = StratifiedKFold(n_splits=10, shuffle=True)
In [184]:
len(x.columns)
Out[184]:
277
In [198]:
for train, test in k_fold.split(x, y):
    print (len(train))
/opt/anaconda3/lib/python3.7/site-packages/sklearn/model_selection/_split.py:657: Warning: The least populated class in y has only 1 members, which is too few. The minimum number of members in any class cannot be less than n_splits=10.
  % (min_groups, self.n_splits)), Warning)
332920
332978
333040
333144
333199
333244
333326
333364
333438
333471
In [210]:
x.head()
Out[210]:
host_is_superhost_0 host_is_superhost_1 property_type_Apartment property_type_Condominium property_type_Guesthouse property_type_House room_type_Entire home/apt room_type_Hotel room room_type_Private room room_type_Shared room ... neighbourhood_cleansed_Willowbrook neighbourhood_cleansed_Wilmington neighbourhood_cleansed_Windsor Square neighbourhood_cleansed_Winnetka neighbourhood_cleansed_Woodland Hills bathrooms bedrooms beds cleaning_fee minimum_nights
0 1 0 0 1 0 0 1 0 0 0 ... 0 0 0 0 0 2.0 2.0 3.0 240.0 7
1 1 0 0 0 0 1 1 0 0 0 ... 0 0 0 0 0 1.0 3.0 3.0 100.0 2
2 0 1 1 0 0 0 0 0 1 0 ... 0 0 0 0 0 1.5 1.0 1.0 85.0 6
3 1 0 1 0 0 0 0 0 1 0 ... 0 0 0 0 0 1.0 1.0 1.0 100.0 1
4 1 0 1 0 0 0 1 0 0 0 ... 0 0 0 0 0 1.0 1.0 1.0 75.0 2

5 rows × 277 columns

In [211]:
y.head()
Out[211]:
0    122.0
1    168.0
2     79.0
3    140.0
4     80.0
Name: price, dtype: float64
In [ ]:
 
In [222]:
#Standardize dataset first
scaler = StandardScaler()
dlx_train = scaler.fit_transform(dlx_train.astype(np.float))
In [224]:
dlx_test = scaler.transform(dlx_test.astype(np.float))
In [ ]:
#besides dense, what else could be used here?
#change the input number 
#dropout?
In [302]:
x.shape
Out[302]:
(370236, 281)
In [222]:
n_cols = x_train.shape[1]
In [223]:
n_cols
Out[223]:
277
In [235]:
airbnb_deep_model = Sequential()

airbnb_deep_model.add(Dense(200, activation = 'relu', input_shape = (n_cols,)))

airbnb_deep_model.add(Dense(200, activation = 'relu'))
                      
airbnb_deep_model.add(Dense(200, activation = 'relu'))
                      
airbnb_deep_model.add(Dense(200, activation = 'relu'))
                      
airbnb_deep_model.add(Dense(1))
In [236]:
airbnb_deep_model.compile(optimizer = 'Adam', loss ='mean_squared_error')

let's compare this outcome with Lasso

In [240]:
#batch_size double check

airbnb_fit= airbnb_deep_model.fit(x,y,validation_split = 0.2)



#airbnb_deep_model.summary()
9256/9256 [==============================] - 15s 2ms/step - loss: 63986.5938 - val_loss: 72638.4297
In [273]:
print(np.sqrt(63986))
print(np.sqrt(72638))
252.95454137057908
269.5143780951213
In [ ]:
# predict () 
In [238]:
mse_score = airbnb_deep_model.evaluate(x,y)
11570/11570 [==============================] - 14s 1ms/step - loss: 68611.5938
In [239]:
mse_score
Out[239]:
68611.59375

Outliers

In [335]:
airbnb_la.price.describe()
Out[335]:
count    370236.000000
mean        186.830727
std         366.790916
min           0.000000
25%          75.000000
50%         113.000000
75%         189.000000
max       25000.000000
Name: price, dtype: float64
In [345]:
airbnb_la[airbnb_la['price'] > 1.5* iqr + 189].shape
Out[345]:
(30822, 13)
In [377]:
airbnb_la[airbnb_la['price'] < 1.5* iqr -75].shape
Out[377]:
(146181, 13)
In [346]:
airbnb_la.shape
Out[346]:
(370236, 13)
In [344]:
q75, q25 = np.percentile(airbnb_la['price'], [75 ,25])
iqr = q75 - q25
In [ ]:
 
In [ ]:
import seaborn as sns
sns.set_theme(style="whitegrid")
tips = sns.load_dataset("tips")
ax = sns.boxplot(x=tips["total_bill"])
In [338]:
import seaborn as sns
In [339]:
sns.set_theme(style="whitegrid")
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-339-bcdee8ede155> in <module>
----> 1 sns.set_theme(style="whitegrid")

AttributeError: module 'seaborn' has no attribute 'set_theme'
In [341]:
airbnb_la.head()
Out[341]:
id host_is_superhost neighbourhood_cleansed property_type room_type accommodates bathrooms bedrooms beds amenities price cleaning_fee minimum_nights
0 109 0 Culver City Condominium Entire home/apt 6 2.0 2.0 3.0 32 122.0 240.0 7
1 344 0 Burbank House Entire home/apt 6 1.0 3.0 3.0 41 168.0 100.0 2
2 2708 1 Hollywood Apartment Private room 1 1.5 1.0 1.0 43 79.0 85.0 6
3 2732 0 Santa Monica Apartment Private room 1 1.0 1.0 1.0 12 140.0 100.0 1
4 2864 0 Bellflower Apartment Entire home/apt 2 1.0 1.0 1.0 20 80.0 75.0 2
In [342]:
box_plot = sns.boxplot(x = airbnb_la.room_type, y = airbnb_la.price)
In [ ]:
 

Visualizations

In [500]:
airbnb_la.head()
Out[500]:
id host_is_superhost neighbourhood_cleansed property_type room_type accommodates bathrooms bedrooms beds amenities price cleaning_fee minimum_nights
0 109 0 Culver City Condominium Entire home/apt 6 2.0 2.0 3.0 32 122.0 240.0 7
1 344 0 Burbank House Entire home/apt 6 1.0 3.0 3.0 41 168.0 100.0 2
2 2708 1 Hollywood Apartment Private room 1 1.5 1.0 1.0 43 79.0 85.0 6
3 2732 0 Santa Monica Apartment Private room 1 1.0 1.0 1.0 12 140.0 100.0 1
4 2864 0 Bellflower Apartment Entire home/apt 2 1.0 1.0 1.0 20 80.0 75.0 2
In [ ]:
 
In [147]:
# what property has the highest average price?

airbnb_nomissing[['property_type','price']].groupby('property_type').agg(lambda x : x.mean())
Out[147]:
price
property_type
Apartment 139.456488
Condominium 180.236875
Guesthouse 118.733868
House 251.558425
In [148]:
airbnb_nomissing[['room_type','price']].groupby('room_type').agg(lambda x : x.mean())
Out[148]:
price
room_type
Entire home/apt 242.841626
Hotel room 26.500000
Private room 78.254489
Shared room 52.523022

The proportion of super host for each property type and room type

In [245]:
import matplotlib.pyplot as plt
import seaborn as sns
---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
<ipython-input-245-5cbea1f69633> in <module>
      1 import matplotlib.pyplot as plt
      2 import seaborn as sns
----> 3 import folium

ModuleNotFoundError: No module named 'folium'
In [351]:
property_superhost = airbnb_nomissing[['property_type','host_is_superhost']].groupby('property_type').agg(lambda x : len(x == 't') )
In [436]:
property_superhost
Out[436]:
host_is_superhost
property_type
Apartment 187133
Condominium 29885
Guesthouse 29399
House 175610
In [437]:
airbnb_nomissing[['property_type','price']].groupby('property_type').agg(lambda x : x.mean())
Out[437]:
price
property_type
Apartment 139.456488
Condominium 180.236875
Guesthouse 118.733868
House 251.558425
In [ ]:
##line graph 
In [ ]:
## 
In [ ]:
 
In [352]:
room_superhost = airbnb_nomissing[['room_type','host_is_superhost']].groupby('room_type').agg(lambda x : len(x == 't') )
In [354]:
property_room_superhost =airbnb_nomissing.groupby(['property_type','room_type'])['host_is_superhost'].agg(lambda x : len(x == 't')).to_frame()
In [138]:
airbnb_with_dummies.select_dtypes(include = 'object').columns
Out[138]:
Index([], dtype='object')
In [397]:
airbnb_training.head()
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-397-ae87ae96140e> in <module>
----> 1 airbnb_training.head()

NameError: name 'airbnb_training' is not defined
In [396]:
neighbour_property = airbnb_training.groupby('neighbourhood_cleansed')['property_type'].count().sort_values(ascending = False).to_frame()
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-396-add5399651b5> in <module>
----> 1 neighbour_property = airbnb_training.groupby('neighbourhood_cleansed')['property_type'].count().sort_values(ascending = False).to_frame()

NameError: name 'airbnb_training' is not defined
In [216]:
neighbour_property = airbnb_la.groupby('neighbourhood_cleansed').agg({'price':mean})
In [219]:
neighbour_property = neighbour_property.reset_index()
In [224]:
sorted_average_price = neighbour_property.sort_values(by = 'price', ascending = False)
In [225]:
sorted_average_price
Out[225]:
neighbourhood_cleansed price
19 Bel-Air 1364.612403
133 Malibu 1154.949898
23 Beverly Crest 939.916549
221 Unincorporated Catalina Island 811.416667
101 Hollywood Hills West 683.570960
... ... ...
15 Avocado Heights 47.604651
244 West Compton 45.000000
173 Rancho Dominguez 39.000000
190 Santa Fe Springs 37.076923
241 Watts 30.287582

262 rows × 2 columns

In [395]:
neighbour_property = neighbour_property.reset_index()
neighbour_listings  = neighbour_property.rename(columns = {'property_type':'num_listings'})
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-395-f395156b9a77> in <module>
----> 1 neighbour_property = neighbour_property.reset_index()
      2 neighbour_listings  = neighbour_property.rename(columns = {'property_type':'num_listings'})

NameError: name 'neighbour_property' is not defined
In [494]:
airbnb_training.head()
Out[494]:
id host_is_superhost host_total_listings_count neighbourhood_cleansed property_type room_type accommodates bathrooms bedrooms beds amenities price cleaning_fee guests_included minimum_nights maximum_nights instant_bookable calculated_host_listings_count_entire_homes calculated_host_listings_count_private_rooms calculated_host_listings_count_shared_rooms
0 109 f 1.0 Culver City Condominium Entire home/apt 6 2.0 2.0 3.0 32 122.0 240.0 3 7 730 f 1 0 0
1 344 f 1.0 Burbank House Entire home/apt 6 1.0 3.0 3.0 41 168.0 100.0 6 2 14 t 1 0 0
2 2708 t 2.0 Hollywood Apartment Private room 1 1.5 1.0 1.0 43 79.0 85.0 1 6 366 t 0 2 0
3 2732 f 2.0 Santa Monica Apartment Private room 1 1.0 1.0 1.0 12 140.0 100.0 1 1 180 f 1 1 0
4 2864 f 1.0 Bellflower Apartment Entire home/apt 2 1.0 1.0 1.0 20 80.0 75.0 1 2 730 f 1 0 0
In [520]:
neighbour_property.head()
Out[520]:
index neighbourhood_cleansed property_type
0 0 Hollywood 27429
1 1 Venice 25371
2 2 Long Beach 16488
3 3 Downtown 13158
4 4 Santa Monica 11913
In [223]:
import folium
In [505]:
gjson2 = r'https://apps.gis.ucla.edu/geodata/dataset/93d71e41-6196-4ecb-9ddd-15f1a4a7630c/resource/6cde4e9e-307c-477d-9089-cae9484c8bc1/download/la-county-neighborhoods-v6.geojson'
area_map = folium.Map(location =[34.0522, -118.2437], zoom_start = 12 )
In [ ]:
 
In [506]:
area_map.choropleth(
    
    geo_data=gjson2,
    data=sorted_average_price,
    columns=['neighbourhood_cleansed','price'],
    key_on='feature.properties.name',
    fill_color='YlOrRd', 
    fill_opacity=0.7, 
    line_opacity=0.2,
    legend_name='Average Listing price in Los Angeles'
    
)

area_map
Out[506]:
Make this Notebook Trusted to load map: File -> Trust Notebook
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [399]:
airbnb_nomissing['id'].value_counts().shape
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-399-4f23e96396e9> in <module>
----> 1 airbnb_nomissing['id'].value_counts().shape

NameError: name 'airbnb_nomissing' is not defined
In [146]:
len(airbnb_nomissing['id'])
Out[146]:
422027
In [161]:
airbnb_nomissing.zipcode.dtypes
Out[161]:
CategoricalDtype(categories=[        90.0,      90001.0,      90002.0,      90003.0,
                       90004.0,      90005.0,      90006.0,      90007.0,
                       90008.0,      90010.0,
                  ...
                    'CA 90212',   'CA 90272',   'CA 90292',   'CA 90403',
                    'CA 90405',   'CA 91765',    'CA91780',    'Ca91745',
                  'Near 91304',    'ca91748'],
                 ordered=False)
In [162]:
zipcode_priceavg = airbnb_nomissing[['zipcode','price']].groupby('zipcode').agg(lambda x: x.mean())
#missing_stats.sort_values(by = "missing_num", ascending  =False, inplace = True)
In [165]:
zipcode_priceavg.sort_values(by = 'price',ascending = False, inplace = True)
In [166]:
zipcode_priceavg.head()
Out[166]:
price
zipcode
-- default zip code -- 4427.200000
93063 1529.000000
90077 1280.607465
91361.0 1158.750000
90265.0 1099.192661
In [167]:
if "default zip code" in airbnb_nomissing['zipcode']:
    print("yes")
In [171]:
airbnb_nomissing[airbnb_nomissing['zipcode'] == '-- default zip code --'][['zipcode','price',
                                                                           'property_type','room_type']]
Out[171]:
zipcode price property_type room_type
24884 -- default zip code -- 2344.0 Apartment Entire home/apt
34601 -- default zip code -- 1813.0 Apartment Entire home/apt
34602 -- default zip code -- 4697.0 Apartment Entire home/apt
34603 -- default zip code -- 3071.0 Apartment Entire home/apt
37073 -- default zip code -- 10211.0 Apartment Entire home/apt
In [356]:
airbnb_nomissing.head()
Out[356]:
id host_is_superhost host_total_listings_count zipcode latitude longitude property_type room_type accommodates bathrooms bedrooms beds amenities price cleaning_fee guests_included minimum_nights maximum_nights instant_bookable calculated_host_listings_count_entire_homes calculated_host_listings_count_private_rooms calculated_host_listings_count_shared_rooms
0 109 f 1.0 90230 33.982095 -118.384935 Condominium Entire home/apt 6 2.0 2.0 3.0 32 122.0 240.0 3 7 730 f 1 0 0
1 344 f 1.0 91505 34.165616 -118.334582 House Entire home/apt 6 1.0 3.0 3.0 41 168.0 100.0 6 2 14 t 1 0 0
2 2708 t 2.0 90046 34.097676 -118.346023 Apartment Private room 1 1.5 1.0 1.0 43 79.0 85.0 1 6 366 t 0 2 0
3 2732 f 2.0 90405 34.004750 -118.481266 Apartment Private room 1 1.0 1.0 1.0 12 140.0 100.0 1 1 180 f 1 1 0
4 2864 f 1.0 90706 33.876189 -118.113968 Apartment Entire home/apt 2 1.0 1.0 1.0 20 80.0 75.0 1 2 730 f 1 0 0
In [ ]: